diff mbox series

[v3,13/13] net/mlx5: add e-switch VXLAN rule cleanup routines

Message ID 1541074741-41368-14-git-send-email-viacheslavo@mellanox.com (mailing list archive)
State Superseded, archived
Delegated to: Shahaf Shuler
Headers show
Series net/mlx5: e-switch VXLAN encap/decap hardware offload | expand

Checks

Context Check Description
ci/Intel-compilation success Compilation OK

Commit Message

Slava Ovsiienko Nov. 1, 2018, 12:19 p.m. UTC
The last part of patchset contains the rule cleanup routines.
These ones is the part of outer interface initialization at
the moment of VXLAN VTEP attaching. These routines query
the list of attached VXLAN devices, the list of local IP
addresses with peer and link scope attribute and the list
of permanent neigh rules, then all found abovementioned
items on the specified outer device are flushed.

Suggested-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
---
 drivers/net/mlx5/mlx5_flow_tcf.c | 495 ++++++++++++++++++++++++++++++++++++++-
 1 file changed, 494 insertions(+), 1 deletion(-)

Comments

Yongseok Koh Nov. 2, 2018, 12:01 a.m. UTC | #1
On Thu, Nov 01, 2018 at 05:19:35AM -0700, Slava Ovsiienko wrote:
> The last part of patchset contains the rule cleanup routines.
> These ones is the part of outer interface initialization at
> the moment of VXLAN VTEP attaching. These routines query
> the list of attached VXLAN devices, the list of local IP
> addresses with peer and link scope attribute and the list
> of permanent neigh rules, then all found abovementioned
> items on the specified outer device are flushed.
> 
> Suggested-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
> ---
Acked-by: Yongseok Koh <yskoh@mellanox.com>

Thanks

>  drivers/net/mlx5/mlx5_flow_tcf.c | 495 ++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 494 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/net/mlx5/mlx5_flow_tcf.c b/drivers/net/mlx5/mlx5_flow_tcf.c
> index eae80ae..420816f 100644
> --- a/drivers/net/mlx5/mlx5_flow_tcf.c
> +++ b/drivers/net/mlx5/mlx5_flow_tcf.c
> @@ -3756,6 +3756,496 @@ struct pedit_parser {
>  #define MNL_REQUEST_SIZE RTE_MIN(RTE_MAX(sysconf(_SC_PAGESIZE), \
>  				 MNL_REQUEST_SIZE_MIN), MNL_REQUEST_SIZE_MAX)
>  
> +/* Data structures used by flow_tcf_xxx_cb() routines. */
> +struct tcf_nlcb_buf {
> +	LIST_ENTRY(tcf_nlcb_buf) next;
> +	uint32_t size;
> +	alignas(struct nlmsghdr)
> +	uint8_t msg[]; /**< Netlink message data. */
> +};
> +
> +struct tcf_nlcb_context {
> +	unsigned int ifindex; /**< Base interface index. */
> +	uint32_t bufsize;
> +	LIST_HEAD(, tcf_nlcb_buf) nlbuf;
> +};
> +
> +/**
> + * Allocate space for netlink command in buffer list
> + *
> + * @param[in, out] ctx
> + *   Pointer to callback context with command buffers list.
> + * @param[in] size
> + *   Required size of data buffer to be allocated.
> + *
> + * @return
> + *   Pointer to allocated memory, aligned as message header.
> + *   NULL if some error occurred.
> + */
> +static struct nlmsghdr *
> +flow_tcf_alloc_nlcmd(struct tcf_nlcb_context *ctx, uint32_t size)
> +{
> +	struct tcf_nlcb_buf *buf;
> +	struct nlmsghdr *nlh;
> +
> +	size = NLMSG_ALIGN(size);
> +	buf = LIST_FIRST(&ctx->nlbuf);
> +	if (buf && (buf->size + size) <= ctx->bufsize) {
> +		nlh = (struct nlmsghdr *)&buf->msg[buf->size];
> +		buf->size += size;
> +		return nlh;
> +	}
> +	if (size > ctx->bufsize) {
> +		DRV_LOG(WARNING, "netlink: too long command buffer requested");
> +		return NULL;
> +	}
> +	buf = rte_malloc(__func__,
> +			ctx->bufsize + sizeof(struct tcf_nlcb_buf),
> +			alignof(struct tcf_nlcb_buf));
> +	if (!buf) {
> +		DRV_LOG(WARNING, "netlink: no memory for command buffer");
> +		return NULL;
> +	}
> +	LIST_INSERT_HEAD(&ctx->nlbuf, buf, next);
> +	buf->size = size;
> +	nlh = (struct nlmsghdr *)&buf->msg[0];
> +	return nlh;
> +}
> +
> +/**
> + * Set NLM_F_ACK flags in the last netlink command in buffer.
> + * Only last command in the buffer will be acked by system.
> + *
> + * @param[in, out] buf
> + *   Pointer to buffer with netlink commands.
> + */
> +static void
> +flow_tcf_setack_nlcmd(struct tcf_nlcb_buf *buf)
> +{
> +	struct nlmsghdr *nlh;
> +	uint32_t size = 0;
> +
> +	assert(buf->size);
> +	do {
> +		nlh = (struct nlmsghdr *)&buf->msg[size];
> +		size += NLMSG_ALIGN(nlh->nlmsg_len);
> +		if (size >= buf->size) {
> +			nlh->nlmsg_flags |= NLM_F_ACK;
> +			break;
> +		}
> +	} while (true);
> +}
> +
> +/**
> + * Send the buffers with prepared netlink commands. Scans the list and
> + * sends all found buffers. Buffers are sent and freed anyway in order
> + * to prevent memory leakage if some every message in received packet.
> + *
> + * @param[in] tcf
> + *   Context object initialized by mlx5_flow_tcf_context_create().
> + * @param[in, out] ctx
> + *   Pointer to callback context with command buffers list.
> + *
> + * @return
> + *   Zero value on success, negative errno value otherwise
> + *   and rte_errno is set.
> + */
> +static int
> +flow_tcf_send_nlcmd(struct mlx5_flow_tcf_context *tcf,
> +		    struct tcf_nlcb_context *ctx)
> +{
> +	struct tcf_nlcb_buf *bc, *bn;
> +	struct nlmsghdr *nlh;
> +	int ret = 0;
> +
> +	bc = LIST_FIRST(&ctx->nlbuf);
> +	while (bc) {
> +		int rc;
> +
> +		bn = LIST_NEXT(bc, next);
> +		if (bc->size) {
> +			flow_tcf_setack_nlcmd(bc);
> +			nlh = (struct nlmsghdr *)&bc->msg;
> +			rc = flow_tcf_nl_ack(tcf, nlh, bc->size, NULL, NULL);
> +			if (rc && !ret)
> +				ret = rc;
> +		}
> +		rte_free(bc);
> +		bc = bn;
> +	}
> +	LIST_INIT(&ctx->nlbuf);
> +	return ret;
> +}
> +
> +/**
> + * Collect local IP address rules with scope link attribute  on specified
> + * network device. This is callback routine called by libmnl mnl_cb_run()
> + * in loop for every message in received packet.
> + *
> + * @param[in] nlh
> + *   Pointer to reply header.
> + * @param[in, out] arg
> + *   Opaque data pointer for this callback.
> + *
> + * @return
> + *   A positive, nonzero value on success, negative errno value otherwise
> + *   and rte_errno is set.
> + */
> +static int
> +flow_tcf_collect_local_cb(const struct nlmsghdr *nlh, void *arg)
> +{
> +	struct tcf_nlcb_context *ctx = arg;
> +	struct nlmsghdr *cmd;
> +	struct ifaddrmsg *ifa;
> +	struct nlattr *na;
> +	struct nlattr *na_local = NULL;
> +	struct nlattr *na_peer = NULL;
> +	unsigned char family;
> +
> +	if (nlh->nlmsg_type != RTM_NEWADDR) {
> +		rte_errno = EINVAL;
> +		return -rte_errno;
> +	}
> +	ifa = mnl_nlmsg_get_payload(nlh);
> +	family = ifa->ifa_family;
> +	if (ifa->ifa_index != ctx->ifindex ||
> +	    ifa->ifa_scope != RT_SCOPE_LINK ||
> +	    !(ifa->ifa_flags & IFA_F_PERMANENT) ||
> +	    (family != AF_INET && family != AF_INET6))
> +		return 1;
> +	mnl_attr_for_each(na, nlh, sizeof(*ifa)) {
> +		switch (mnl_attr_get_type(na)) {
> +		case IFA_LOCAL:
> +			na_local = na;
> +			break;
> +		case IFA_ADDRESS:
> +			na_peer = na;
> +			break;
> +		}
> +		if (na_local && na_peer)
> +			break;
> +	}
> +	if (!na_local || !na_peer)
> +		return 1;
> +	/* Local rule found with scope link, permanent and assigned peer. */
> +	cmd = flow_tcf_alloc_nlcmd(ctx, MNL_ALIGN(sizeof(struct nlmsghdr)) +
> +					MNL_ALIGN(sizeof(struct ifaddrmsg)) +
> +					(family == AF_INET6
> +					? 2 * SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN)
> +					: 2 * SZ_NLATTR_TYPE_OF(uint32_t)));
> +	if (!cmd) {
> +		rte_errno = ENOMEM;
> +		return -rte_errno;
> +	}
> +	cmd = mnl_nlmsg_put_header(cmd);
> +	cmd->nlmsg_type = RTM_DELADDR;
> +	cmd->nlmsg_flags = NLM_F_REQUEST;
> +	ifa = mnl_nlmsg_put_extra_header(cmd, sizeof(*ifa));
> +	ifa->ifa_flags = IFA_F_PERMANENT;
> +	ifa->ifa_scope = RT_SCOPE_LINK;
> +	ifa->ifa_index = ctx->ifindex;
> +	if (family == AF_INET) {
> +		ifa->ifa_family = AF_INET;
> +		ifa->ifa_prefixlen = 32;
> +		mnl_attr_put_u32(cmd, IFA_LOCAL, mnl_attr_get_u32(na_local));
> +		mnl_attr_put_u32(cmd, IFA_ADDRESS, mnl_attr_get_u32(na_peer));
> +	} else {
> +		ifa->ifa_family = AF_INET6;
> +		ifa->ifa_prefixlen = 128;
> +		mnl_attr_put(cmd, IFA_LOCAL, IPV6_ADDR_LEN,
> +			mnl_attr_get_payload(na_local));
> +		mnl_attr_put(cmd, IFA_ADDRESS, IPV6_ADDR_LEN,
> +			mnl_attr_get_payload(na_peer));
> +	}
> +	return 1;
> +}
> +
> +/**
> + * Cleanup the local IP addresses on outer interface.
> + *
> + * @param[in] tcf
> + *   Context object initialized by mlx5_flow_tcf_context_create().
> + * @param[in] ifindex
> + *   Network inferface index to perform cleanup.
> + */
> +static void
> +flow_tcf_encap_local_cleanup(struct mlx5_flow_tcf_context *tcf,
> +			    unsigned int ifindex)
> +{
> +	struct nlmsghdr *nlh;
> +	struct ifaddrmsg *ifa;
> +	struct tcf_nlcb_context ctx = {
> +		.ifindex = ifindex,
> +		.bufsize = MNL_REQUEST_SIZE,
> +		.nlbuf = LIST_HEAD_INITIALIZER(),
> +	};
> +	int ret;
> +
> +	assert(ifindex);
> +	/*
> +	 * Seek and destroy leftovers of local IP addresses with
> +	 * matching properties "scope link".
> +	 */
> +	nlh = mnl_nlmsg_put_header(tcf->buf);
> +	nlh->nlmsg_type = RTM_GETADDR;
> +	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
> +	ifa = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifa));
> +	ifa->ifa_family = AF_UNSPEC;
> +	ifa->ifa_index = ifindex;
> +	ifa->ifa_scope = RT_SCOPE_LINK;
> +	ret = flow_tcf_nl_ack(tcf, nlh, 0, flow_tcf_collect_local_cb, &ctx);
> +	if (ret)
> +		DRV_LOG(WARNING, "netlink: query device list error %d", ret);
> +	ret = flow_tcf_send_nlcmd(tcf, &ctx);
> +	if (ret)
> +		DRV_LOG(WARNING, "netlink: device delete error %d", ret);
> +}
> +
> +/**
> + * Collect neigh permament rules on specified network device.
> + * This is callback routine called by libmnl mnl_cb_run() in loop for
> + * every message in received packet.
> + *
> + * @param[in] nlh
> + *   Pointer to reply header.
> + * @param[in, out] arg
> + *   Opaque data pointer for this callback.
> + *
> + * @return
> + *   A positive, nonzero value on success, negative errno value otherwise
> + *   and rte_errno is set.
> + */
> +static int
> +flow_tcf_collect_neigh_cb(const struct nlmsghdr *nlh, void *arg)
> +{
> +	struct tcf_nlcb_context *ctx = arg;
> +	struct nlmsghdr *cmd;
> +	struct ndmsg *ndm;
> +	struct nlattr *na;
> +	struct nlattr *na_ip = NULL;
> +	struct nlattr *na_mac = NULL;
> +	unsigned char family;
> +
> +	if (nlh->nlmsg_type != RTM_NEWNEIGH) {
> +		rte_errno = EINVAL;
> +		return -rte_errno;
> +	}
> +	ndm = mnl_nlmsg_get_payload(nlh);
> +	family = ndm->ndm_family;
> +	if (ndm->ndm_ifindex != (int)ctx->ifindex ||
> +	   !(ndm->ndm_state & NUD_PERMANENT) ||
> +	   (family != AF_INET && family != AF_INET6))
> +		return 1;
> +	mnl_attr_for_each(na, nlh, sizeof(*ndm)) {
> +		switch (mnl_attr_get_type(na)) {
> +		case NDA_DST:
> +			na_ip = na;
> +			break;
> +		case NDA_LLADDR:
> +			na_mac = na;
> +			break;
> +		}
> +		if (na_mac && na_ip)
> +			break;
> +	}
> +	if (!na_mac || !na_ip)
> +		return 1;
> +	/* Neigh rule with permenent attribute found. */
> +	cmd = flow_tcf_alloc_nlcmd(ctx, MNL_ALIGN(sizeof(struct nlmsghdr)) +
> +					MNL_ALIGN(sizeof(struct ndmsg)) +
> +					SZ_NLATTR_DATA_OF(ETHER_ADDR_LEN) +
> +					(family == AF_INET6
> +					? SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN)
> +					: SZ_NLATTR_TYPE_OF(uint32_t)));
> +	if (!cmd) {
> +		rte_errno = ENOMEM;
> +		return -rte_errno;
> +	}
> +	cmd = mnl_nlmsg_put_header(cmd);
> +	cmd->nlmsg_type = RTM_DELNEIGH;
> +	cmd->nlmsg_flags = NLM_F_REQUEST;
> +	ndm = mnl_nlmsg_put_extra_header(cmd, sizeof(*ndm));
> +	ndm->ndm_ifindex = ctx->ifindex;
> +	ndm->ndm_state = NUD_PERMANENT;
> +	ndm->ndm_flags = 0;
> +	ndm->ndm_type = 0;
> +	if (family == AF_INET) {
> +		ndm->ndm_family = AF_INET;
> +		mnl_attr_put_u32(cmd, NDA_DST, mnl_attr_get_u32(na_ip));
> +	} else {
> +		ndm->ndm_family = AF_INET6;
> +		mnl_attr_put(cmd, NDA_DST, IPV6_ADDR_LEN,
> +			     mnl_attr_get_payload(na_ip));
> +	}
> +	mnl_attr_put(cmd, NDA_LLADDR, ETHER_ADDR_LEN,
> +		     mnl_attr_get_payload(na_mac));
> +	return 1;
> +}
> +
> +/**
> + * Cleanup the neigh rules on outer interface.
> + *
> + * @param[in] tcf
> + *   Context object initialized by mlx5_flow_tcf_context_create().
> + * @param[in] ifindex
> + *   Network inferface index to perform cleanup.
> + */
> +static void
> +flow_tcf_encap_neigh_cleanup(struct mlx5_flow_tcf_context *tcf,
> +			    unsigned int ifindex)
> +{
> +	struct nlmsghdr *nlh;
> +	struct ndmsg *ndm;
> +	struct tcf_nlcb_context ctx = {
> +		.ifindex = ifindex,
> +		.bufsize = MNL_REQUEST_SIZE,
> +		.nlbuf = LIST_HEAD_INITIALIZER(),
> +	};
> +	int ret;
> +
> +	assert(ifindex);
> +	/* Seek and destroy leftovers of neigh rules. */
> +	nlh = mnl_nlmsg_put_header(tcf->buf);
> +	nlh->nlmsg_type = RTM_GETNEIGH;
> +	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
> +	ndm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ndm));
> +	ndm->ndm_family = AF_UNSPEC;
> +	ndm->ndm_ifindex = ifindex;
> +	ndm->ndm_state = NUD_PERMANENT;
> +	ret = flow_tcf_nl_ack(tcf, nlh, 0, flow_tcf_collect_neigh_cb, &ctx);
> +	if (ret)
> +		DRV_LOG(WARNING, "netlink: query device list error %d", ret);
> +	ret = flow_tcf_send_nlcmd(tcf, &ctx);
> +	if (ret)
> +		DRV_LOG(WARNING, "netlink: device delete error %d", ret);
> +}
> +
> +/**
> + * Collect indices of VXLAN encap/decap interfaces associated with device.
> + * This is callback routine called by libmnl mnl_cb_run() in loop for
> + * every message in received packet.
> + *
> + * @param[in] nlh
> + *   Pointer to reply header.
> + * @param[in, out] arg
> + *   Opaque data pointer for this callback.
> + *
> + * @return
> + *   A positive, nonzero value on success, negative errno value otherwise
> + *   and rte_errno is set.
> + */
> +static int
> +flow_tcf_collect_vxlan_cb(const struct nlmsghdr *nlh, void *arg)
> +{
> +	struct tcf_nlcb_context *ctx = arg;
> +	struct nlmsghdr *cmd;
> +	struct ifinfomsg *ifm;
> +	struct nlattr *na;
> +	struct nlattr *na_info = NULL;
> +	struct nlattr *na_vxlan = NULL;
> +	bool found = false;
> +	unsigned int vxindex;
> +
> +	if (nlh->nlmsg_type != RTM_NEWLINK) {
> +		rte_errno = EINVAL;
> +		return -rte_errno;
> +	}
> +	ifm = mnl_nlmsg_get_payload(nlh);
> +	if (!ifm->ifi_index) {
> +		rte_errno = EINVAL;
> +		return -rte_errno;
> +	}
> +	mnl_attr_for_each(na, nlh, sizeof(*ifm))
> +		if (mnl_attr_get_type(na) == IFLA_LINKINFO) {
> +			na_info = na;
> +			break;
> +		}
> +	if (!na_info)
> +		return 1;
> +	mnl_attr_for_each_nested(na, na_info) {
> +		switch (mnl_attr_get_type(na)) {
> +		case IFLA_INFO_KIND:
> +			if (!strncmp("vxlan", mnl_attr_get_str(na),
> +				     mnl_attr_get_len(na)))
> +				found = true;
> +			break;
> +		case IFLA_INFO_DATA:
> +			na_vxlan = na;
> +			break;
> +		}
> +		if (found && na_vxlan)
> +			break;
> +	}
> +	if (!found || !na_vxlan)
> +		return 1;
> +	found = false;
> +	mnl_attr_for_each_nested(na, na_vxlan) {
> +		if (mnl_attr_get_type(na) == IFLA_VXLAN_LINK &&
> +		    mnl_attr_get_u32(na) == ctx->ifindex) {
> +			found = true;
> +			break;
> +		}
> +	}
> +	if (!found)
> +		return 1;
> +	/* Attached VXLAN device found, store the command to delete. */
> +	vxindex = ifm->ifi_index;
> +	cmd = flow_tcf_alloc_nlcmd(ctx, MNL_ALIGN(sizeof(struct nlmsghdr)) +
> +					MNL_ALIGN(sizeof(struct ifinfomsg)));
> +	if (!nlh) {
> +		rte_errno = ENOMEM;
> +		return -rte_errno;
> +	}
> +	cmd = mnl_nlmsg_put_header(cmd);
> +	cmd->nlmsg_type = RTM_DELLINK;
> +	cmd->nlmsg_flags = NLM_F_REQUEST;
> +	ifm = mnl_nlmsg_put_extra_header(cmd, sizeof(*ifm));
> +	ifm->ifi_family = AF_UNSPEC;
> +	ifm->ifi_index = vxindex;
> +	return 1;
> +}
> +
> +/**
> + * Cleanup the outer interface. Removes all found vxlan devices
> + * attached to specified index, flushes the meigh and local IP
> + * datavase.
> + *
> + * @param[in] tcf
> + *   Context object initialized by mlx5_flow_tcf_context_create().
> + * @param[in] ifindex
> + *   Network inferface index to perform cleanup.
> + */
> +static void
> +flow_tcf_encap_iface_cleanup(struct mlx5_flow_tcf_context *tcf,
> +			    unsigned int ifindex)
> +{
> +	struct nlmsghdr *nlh;
> +	struct ifinfomsg *ifm;
> +	struct tcf_nlcb_context ctx = {
> +		.ifindex = ifindex,
> +		.bufsize = MNL_REQUEST_SIZE,
> +		.nlbuf = LIST_HEAD_INITIALIZER(),
> +	};
> +	int ret;
> +
> +	assert(ifindex);
> +	/*
> +	 * Seek and destroy leftover VXLAN encap/decap interfaces with
> +	 * matching properties.
> +	 */
> +	nlh = mnl_nlmsg_put_header(tcf->buf);
> +	nlh->nlmsg_type = RTM_GETLINK;
> +	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
> +	ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
> +	ifm->ifi_family = AF_UNSPEC;
> +	ret = flow_tcf_nl_ack(tcf, nlh, 0, flow_tcf_collect_vxlan_cb, &ctx);
> +	if (ret)
> +		DRV_LOG(WARNING, "netlink: query device list error %d", ret);
> +	ret = flow_tcf_send_nlcmd(tcf, &ctx);
> +	if (ret)
> +		DRV_LOG(WARNING, "netlink: device delete error %d", ret);
> +}
> +
>  /**
>   * Emit Netlink message to add/remove local address to the outer device.
>   * The address being added is visible within the link only (scope link).
> @@ -4325,7 +4815,7 @@ struct pedit_parser {
>  	rte_flow_error_set(error, ENOTSUP,
>  			 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
>  			 "netlink: failed to create VTEP, "
> -			 "VXLAN metadata are not supported by kernel");
> +			 "vxlan metadata are not supported by kernel");
>  	return NULL;
>  }
>  #endif /* HAVE_IFLA_VXLAN_COLLECT_METADATA */
> @@ -4415,6 +4905,9 @@ struct pedit_parser {
>  		uint16_t pcnt;
>  
>  		/* Not found, we should create the new attached VTEP. */
> +		flow_tcf_encap_iface_cleanup(tcf, ifouter);
> +		flow_tcf_encap_local_cleanup(tcf, ifouter);
> +		flow_tcf_encap_neigh_cleanup(tcf, ifouter);
>  		for (pcnt = 0; pcnt <= (MLX5_VXLAN_PORT_MAX
>  				     - MLX5_VXLAN_PORT_MIN); pcnt++) {
>  			encap_port++;
> -- 
> 1.8.3.1
>
diff mbox series

Patch

diff --git a/drivers/net/mlx5/mlx5_flow_tcf.c b/drivers/net/mlx5/mlx5_flow_tcf.c
index eae80ae..420816f 100644
--- a/drivers/net/mlx5/mlx5_flow_tcf.c
+++ b/drivers/net/mlx5/mlx5_flow_tcf.c
@@ -3756,6 +3756,496 @@  struct pedit_parser {
 #define MNL_REQUEST_SIZE RTE_MIN(RTE_MAX(sysconf(_SC_PAGESIZE), \
 				 MNL_REQUEST_SIZE_MIN), MNL_REQUEST_SIZE_MAX)
 
+/* Data structures used by flow_tcf_xxx_cb() routines. */
+struct tcf_nlcb_buf {
+	LIST_ENTRY(tcf_nlcb_buf) next;
+	uint32_t size;
+	alignas(struct nlmsghdr)
+	uint8_t msg[]; /**< Netlink message data. */
+};
+
+struct tcf_nlcb_context {
+	unsigned int ifindex; /**< Base interface index. */
+	uint32_t bufsize;
+	LIST_HEAD(, tcf_nlcb_buf) nlbuf;
+};
+
+/**
+ * Allocate space for netlink command in buffer list
+ *
+ * @param[in, out] ctx
+ *   Pointer to callback context with command buffers list.
+ * @param[in] size
+ *   Required size of data buffer to be allocated.
+ *
+ * @return
+ *   Pointer to allocated memory, aligned as message header.
+ *   NULL if some error occurred.
+ */
+static struct nlmsghdr *
+flow_tcf_alloc_nlcmd(struct tcf_nlcb_context *ctx, uint32_t size)
+{
+	struct tcf_nlcb_buf *buf;
+	struct nlmsghdr *nlh;
+
+	size = NLMSG_ALIGN(size);
+	buf = LIST_FIRST(&ctx->nlbuf);
+	if (buf && (buf->size + size) <= ctx->bufsize) {
+		nlh = (struct nlmsghdr *)&buf->msg[buf->size];
+		buf->size += size;
+		return nlh;
+	}
+	if (size > ctx->bufsize) {
+		DRV_LOG(WARNING, "netlink: too long command buffer requested");
+		return NULL;
+	}
+	buf = rte_malloc(__func__,
+			ctx->bufsize + sizeof(struct tcf_nlcb_buf),
+			alignof(struct tcf_nlcb_buf));
+	if (!buf) {
+		DRV_LOG(WARNING, "netlink: no memory for command buffer");
+		return NULL;
+	}
+	LIST_INSERT_HEAD(&ctx->nlbuf, buf, next);
+	buf->size = size;
+	nlh = (struct nlmsghdr *)&buf->msg[0];
+	return nlh;
+}
+
+/**
+ * Set NLM_F_ACK flags in the last netlink command in buffer.
+ * Only last command in the buffer will be acked by system.
+ *
+ * @param[in, out] buf
+ *   Pointer to buffer with netlink commands.
+ */
+static void
+flow_tcf_setack_nlcmd(struct tcf_nlcb_buf *buf)
+{
+	struct nlmsghdr *nlh;
+	uint32_t size = 0;
+
+	assert(buf->size);
+	do {
+		nlh = (struct nlmsghdr *)&buf->msg[size];
+		size += NLMSG_ALIGN(nlh->nlmsg_len);
+		if (size >= buf->size) {
+			nlh->nlmsg_flags |= NLM_F_ACK;
+			break;
+		}
+	} while (true);
+}
+
+/**
+ * Send the buffers with prepared netlink commands. Scans the list and
+ * sends all found buffers. Buffers are sent and freed anyway in order
+ * to prevent memory leakage if some every message in received packet.
+ *
+ * @param[in] tcf
+ *   Context object initialized by mlx5_flow_tcf_context_create().
+ * @param[in, out] ctx
+ *   Pointer to callback context with command buffers list.
+ *
+ * @return
+ *   Zero value on success, negative errno value otherwise
+ *   and rte_errno is set.
+ */
+static int
+flow_tcf_send_nlcmd(struct mlx5_flow_tcf_context *tcf,
+		    struct tcf_nlcb_context *ctx)
+{
+	struct tcf_nlcb_buf *bc, *bn;
+	struct nlmsghdr *nlh;
+	int ret = 0;
+
+	bc = LIST_FIRST(&ctx->nlbuf);
+	while (bc) {
+		int rc;
+
+		bn = LIST_NEXT(bc, next);
+		if (bc->size) {
+			flow_tcf_setack_nlcmd(bc);
+			nlh = (struct nlmsghdr *)&bc->msg;
+			rc = flow_tcf_nl_ack(tcf, nlh, bc->size, NULL, NULL);
+			if (rc && !ret)
+				ret = rc;
+		}
+		rte_free(bc);
+		bc = bn;
+	}
+	LIST_INIT(&ctx->nlbuf);
+	return ret;
+}
+
+/**
+ * Collect local IP address rules with scope link attribute  on specified
+ * network device. This is callback routine called by libmnl mnl_cb_run()
+ * in loop for every message in received packet.
+ *
+ * @param[in] nlh
+ *   Pointer to reply header.
+ * @param[in, out] arg
+ *   Opaque data pointer for this callback.
+ *
+ * @return
+ *   A positive, nonzero value on success, negative errno value otherwise
+ *   and rte_errno is set.
+ */
+static int
+flow_tcf_collect_local_cb(const struct nlmsghdr *nlh, void *arg)
+{
+	struct tcf_nlcb_context *ctx = arg;
+	struct nlmsghdr *cmd;
+	struct ifaddrmsg *ifa;
+	struct nlattr *na;
+	struct nlattr *na_local = NULL;
+	struct nlattr *na_peer = NULL;
+	unsigned char family;
+
+	if (nlh->nlmsg_type != RTM_NEWADDR) {
+		rte_errno = EINVAL;
+		return -rte_errno;
+	}
+	ifa = mnl_nlmsg_get_payload(nlh);
+	family = ifa->ifa_family;
+	if (ifa->ifa_index != ctx->ifindex ||
+	    ifa->ifa_scope != RT_SCOPE_LINK ||
+	    !(ifa->ifa_flags & IFA_F_PERMANENT) ||
+	    (family != AF_INET && family != AF_INET6))
+		return 1;
+	mnl_attr_for_each(na, nlh, sizeof(*ifa)) {
+		switch (mnl_attr_get_type(na)) {
+		case IFA_LOCAL:
+			na_local = na;
+			break;
+		case IFA_ADDRESS:
+			na_peer = na;
+			break;
+		}
+		if (na_local && na_peer)
+			break;
+	}
+	if (!na_local || !na_peer)
+		return 1;
+	/* Local rule found with scope link, permanent and assigned peer. */
+	cmd = flow_tcf_alloc_nlcmd(ctx, MNL_ALIGN(sizeof(struct nlmsghdr)) +
+					MNL_ALIGN(sizeof(struct ifaddrmsg)) +
+					(family == AF_INET6
+					? 2 * SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN)
+					: 2 * SZ_NLATTR_TYPE_OF(uint32_t)));
+	if (!cmd) {
+		rte_errno = ENOMEM;
+		return -rte_errno;
+	}
+	cmd = mnl_nlmsg_put_header(cmd);
+	cmd->nlmsg_type = RTM_DELADDR;
+	cmd->nlmsg_flags = NLM_F_REQUEST;
+	ifa = mnl_nlmsg_put_extra_header(cmd, sizeof(*ifa));
+	ifa->ifa_flags = IFA_F_PERMANENT;
+	ifa->ifa_scope = RT_SCOPE_LINK;
+	ifa->ifa_index = ctx->ifindex;
+	if (family == AF_INET) {
+		ifa->ifa_family = AF_INET;
+		ifa->ifa_prefixlen = 32;
+		mnl_attr_put_u32(cmd, IFA_LOCAL, mnl_attr_get_u32(na_local));
+		mnl_attr_put_u32(cmd, IFA_ADDRESS, mnl_attr_get_u32(na_peer));
+	} else {
+		ifa->ifa_family = AF_INET6;
+		ifa->ifa_prefixlen = 128;
+		mnl_attr_put(cmd, IFA_LOCAL, IPV6_ADDR_LEN,
+			mnl_attr_get_payload(na_local));
+		mnl_attr_put(cmd, IFA_ADDRESS, IPV6_ADDR_LEN,
+			mnl_attr_get_payload(na_peer));
+	}
+	return 1;
+}
+
+/**
+ * Cleanup the local IP addresses on outer interface.
+ *
+ * @param[in] tcf
+ *   Context object initialized by mlx5_flow_tcf_context_create().
+ * @param[in] ifindex
+ *   Network inferface index to perform cleanup.
+ */
+static void
+flow_tcf_encap_local_cleanup(struct mlx5_flow_tcf_context *tcf,
+			    unsigned int ifindex)
+{
+	struct nlmsghdr *nlh;
+	struct ifaddrmsg *ifa;
+	struct tcf_nlcb_context ctx = {
+		.ifindex = ifindex,
+		.bufsize = MNL_REQUEST_SIZE,
+		.nlbuf = LIST_HEAD_INITIALIZER(),
+	};
+	int ret;
+
+	assert(ifindex);
+	/*
+	 * Seek and destroy leftovers of local IP addresses with
+	 * matching properties "scope link".
+	 */
+	nlh = mnl_nlmsg_put_header(tcf->buf);
+	nlh->nlmsg_type = RTM_GETADDR;
+	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
+	ifa = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifa));
+	ifa->ifa_family = AF_UNSPEC;
+	ifa->ifa_index = ifindex;
+	ifa->ifa_scope = RT_SCOPE_LINK;
+	ret = flow_tcf_nl_ack(tcf, nlh, 0, flow_tcf_collect_local_cb, &ctx);
+	if (ret)
+		DRV_LOG(WARNING, "netlink: query device list error %d", ret);
+	ret = flow_tcf_send_nlcmd(tcf, &ctx);
+	if (ret)
+		DRV_LOG(WARNING, "netlink: device delete error %d", ret);
+}
+
+/**
+ * Collect neigh permament rules on specified network device.
+ * This is callback routine called by libmnl mnl_cb_run() in loop for
+ * every message in received packet.
+ *
+ * @param[in] nlh
+ *   Pointer to reply header.
+ * @param[in, out] arg
+ *   Opaque data pointer for this callback.
+ *
+ * @return
+ *   A positive, nonzero value on success, negative errno value otherwise
+ *   and rte_errno is set.
+ */
+static int
+flow_tcf_collect_neigh_cb(const struct nlmsghdr *nlh, void *arg)
+{
+	struct tcf_nlcb_context *ctx = arg;
+	struct nlmsghdr *cmd;
+	struct ndmsg *ndm;
+	struct nlattr *na;
+	struct nlattr *na_ip = NULL;
+	struct nlattr *na_mac = NULL;
+	unsigned char family;
+
+	if (nlh->nlmsg_type != RTM_NEWNEIGH) {
+		rte_errno = EINVAL;
+		return -rte_errno;
+	}
+	ndm = mnl_nlmsg_get_payload(nlh);
+	family = ndm->ndm_family;
+	if (ndm->ndm_ifindex != (int)ctx->ifindex ||
+	   !(ndm->ndm_state & NUD_PERMANENT) ||
+	   (family != AF_INET && family != AF_INET6))
+		return 1;
+	mnl_attr_for_each(na, nlh, sizeof(*ndm)) {
+		switch (mnl_attr_get_type(na)) {
+		case NDA_DST:
+			na_ip = na;
+			break;
+		case NDA_LLADDR:
+			na_mac = na;
+			break;
+		}
+		if (na_mac && na_ip)
+			break;
+	}
+	if (!na_mac || !na_ip)
+		return 1;
+	/* Neigh rule with permenent attribute found. */
+	cmd = flow_tcf_alloc_nlcmd(ctx, MNL_ALIGN(sizeof(struct nlmsghdr)) +
+					MNL_ALIGN(sizeof(struct ndmsg)) +
+					SZ_NLATTR_DATA_OF(ETHER_ADDR_LEN) +
+					(family == AF_INET6
+					? SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN)
+					: SZ_NLATTR_TYPE_OF(uint32_t)));
+	if (!cmd) {
+		rte_errno = ENOMEM;
+		return -rte_errno;
+	}
+	cmd = mnl_nlmsg_put_header(cmd);
+	cmd->nlmsg_type = RTM_DELNEIGH;
+	cmd->nlmsg_flags = NLM_F_REQUEST;
+	ndm = mnl_nlmsg_put_extra_header(cmd, sizeof(*ndm));
+	ndm->ndm_ifindex = ctx->ifindex;
+	ndm->ndm_state = NUD_PERMANENT;
+	ndm->ndm_flags = 0;
+	ndm->ndm_type = 0;
+	if (family == AF_INET) {
+		ndm->ndm_family = AF_INET;
+		mnl_attr_put_u32(cmd, NDA_DST, mnl_attr_get_u32(na_ip));
+	} else {
+		ndm->ndm_family = AF_INET6;
+		mnl_attr_put(cmd, NDA_DST, IPV6_ADDR_LEN,
+			     mnl_attr_get_payload(na_ip));
+	}
+	mnl_attr_put(cmd, NDA_LLADDR, ETHER_ADDR_LEN,
+		     mnl_attr_get_payload(na_mac));
+	return 1;
+}
+
+/**
+ * Cleanup the neigh rules on outer interface.
+ *
+ * @param[in] tcf
+ *   Context object initialized by mlx5_flow_tcf_context_create().
+ * @param[in] ifindex
+ *   Network inferface index to perform cleanup.
+ */
+static void
+flow_tcf_encap_neigh_cleanup(struct mlx5_flow_tcf_context *tcf,
+			    unsigned int ifindex)
+{
+	struct nlmsghdr *nlh;
+	struct ndmsg *ndm;
+	struct tcf_nlcb_context ctx = {
+		.ifindex = ifindex,
+		.bufsize = MNL_REQUEST_SIZE,
+		.nlbuf = LIST_HEAD_INITIALIZER(),
+	};
+	int ret;
+
+	assert(ifindex);
+	/* Seek and destroy leftovers of neigh rules. */
+	nlh = mnl_nlmsg_put_header(tcf->buf);
+	nlh->nlmsg_type = RTM_GETNEIGH;
+	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
+	ndm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ndm));
+	ndm->ndm_family = AF_UNSPEC;
+	ndm->ndm_ifindex = ifindex;
+	ndm->ndm_state = NUD_PERMANENT;
+	ret = flow_tcf_nl_ack(tcf, nlh, 0, flow_tcf_collect_neigh_cb, &ctx);
+	if (ret)
+		DRV_LOG(WARNING, "netlink: query device list error %d", ret);
+	ret = flow_tcf_send_nlcmd(tcf, &ctx);
+	if (ret)
+		DRV_LOG(WARNING, "netlink: device delete error %d", ret);
+}
+
+/**
+ * Collect indices of VXLAN encap/decap interfaces associated with device.
+ * This is callback routine called by libmnl mnl_cb_run() in loop for
+ * every message in received packet.
+ *
+ * @param[in] nlh
+ *   Pointer to reply header.
+ * @param[in, out] arg
+ *   Opaque data pointer for this callback.
+ *
+ * @return
+ *   A positive, nonzero value on success, negative errno value otherwise
+ *   and rte_errno is set.
+ */
+static int
+flow_tcf_collect_vxlan_cb(const struct nlmsghdr *nlh, void *arg)
+{
+	struct tcf_nlcb_context *ctx = arg;
+	struct nlmsghdr *cmd;
+	struct ifinfomsg *ifm;
+	struct nlattr *na;
+	struct nlattr *na_info = NULL;
+	struct nlattr *na_vxlan = NULL;
+	bool found = false;
+	unsigned int vxindex;
+
+	if (nlh->nlmsg_type != RTM_NEWLINK) {
+		rte_errno = EINVAL;
+		return -rte_errno;
+	}
+	ifm = mnl_nlmsg_get_payload(nlh);
+	if (!ifm->ifi_index) {
+		rte_errno = EINVAL;
+		return -rte_errno;
+	}
+	mnl_attr_for_each(na, nlh, sizeof(*ifm))
+		if (mnl_attr_get_type(na) == IFLA_LINKINFO) {
+			na_info = na;
+			break;
+		}
+	if (!na_info)
+		return 1;
+	mnl_attr_for_each_nested(na, na_info) {
+		switch (mnl_attr_get_type(na)) {
+		case IFLA_INFO_KIND:
+			if (!strncmp("vxlan", mnl_attr_get_str(na),
+				     mnl_attr_get_len(na)))
+				found = true;
+			break;
+		case IFLA_INFO_DATA:
+			na_vxlan = na;
+			break;
+		}
+		if (found && na_vxlan)
+			break;
+	}
+	if (!found || !na_vxlan)
+		return 1;
+	found = false;
+	mnl_attr_for_each_nested(na, na_vxlan) {
+		if (mnl_attr_get_type(na) == IFLA_VXLAN_LINK &&
+		    mnl_attr_get_u32(na) == ctx->ifindex) {
+			found = true;
+			break;
+		}
+	}
+	if (!found)
+		return 1;
+	/* Attached VXLAN device found, store the command to delete. */
+	vxindex = ifm->ifi_index;
+	cmd = flow_tcf_alloc_nlcmd(ctx, MNL_ALIGN(sizeof(struct nlmsghdr)) +
+					MNL_ALIGN(sizeof(struct ifinfomsg)));
+	if (!nlh) {
+		rte_errno = ENOMEM;
+		return -rte_errno;
+	}
+	cmd = mnl_nlmsg_put_header(cmd);
+	cmd->nlmsg_type = RTM_DELLINK;
+	cmd->nlmsg_flags = NLM_F_REQUEST;
+	ifm = mnl_nlmsg_put_extra_header(cmd, sizeof(*ifm));
+	ifm->ifi_family = AF_UNSPEC;
+	ifm->ifi_index = vxindex;
+	return 1;
+}
+
+/**
+ * Cleanup the outer interface. Removes all found vxlan devices
+ * attached to specified index, flushes the meigh and local IP
+ * datavase.
+ *
+ * @param[in] tcf
+ *   Context object initialized by mlx5_flow_tcf_context_create().
+ * @param[in] ifindex
+ *   Network inferface index to perform cleanup.
+ */
+static void
+flow_tcf_encap_iface_cleanup(struct mlx5_flow_tcf_context *tcf,
+			    unsigned int ifindex)
+{
+	struct nlmsghdr *nlh;
+	struct ifinfomsg *ifm;
+	struct tcf_nlcb_context ctx = {
+		.ifindex = ifindex,
+		.bufsize = MNL_REQUEST_SIZE,
+		.nlbuf = LIST_HEAD_INITIALIZER(),
+	};
+	int ret;
+
+	assert(ifindex);
+	/*
+	 * Seek and destroy leftover VXLAN encap/decap interfaces with
+	 * matching properties.
+	 */
+	nlh = mnl_nlmsg_put_header(tcf->buf);
+	nlh->nlmsg_type = RTM_GETLINK;
+	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
+	ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
+	ifm->ifi_family = AF_UNSPEC;
+	ret = flow_tcf_nl_ack(tcf, nlh, 0, flow_tcf_collect_vxlan_cb, &ctx);
+	if (ret)
+		DRV_LOG(WARNING, "netlink: query device list error %d", ret);
+	ret = flow_tcf_send_nlcmd(tcf, &ctx);
+	if (ret)
+		DRV_LOG(WARNING, "netlink: device delete error %d", ret);
+}
+
 /**
  * Emit Netlink message to add/remove local address to the outer device.
  * The address being added is visible within the link only (scope link).
@@ -4325,7 +4815,7 @@  struct pedit_parser {
 	rte_flow_error_set(error, ENOTSUP,
 			 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
 			 "netlink: failed to create VTEP, "
-			 "VXLAN metadata are not supported by kernel");
+			 "vxlan metadata are not supported by kernel");
 	return NULL;
 }
 #endif /* HAVE_IFLA_VXLAN_COLLECT_METADATA */
@@ -4415,6 +4905,9 @@  struct pedit_parser {
 		uint16_t pcnt;
 
 		/* Not found, we should create the new attached VTEP. */
+		flow_tcf_encap_iface_cleanup(tcf, ifouter);
+		flow_tcf_encap_local_cleanup(tcf, ifouter);
+		flow_tcf_encap_neigh_cleanup(tcf, ifouter);
 		for (pcnt = 0; pcnt <= (MLX5_VXLAN_PORT_MAX
 				     - MLX5_VXLAN_PORT_MIN); pcnt++) {
 			encap_port++;