diff mbox series

[v2,6/7] net/mlx5: e-switch VXLAN encapsulation rules management

Message ID 1539612815-47199-7-git-send-email-viacheslavo@mellanox.com (mailing list archive)
State Superseded, archived
Delegated to: Shahaf Shuler
Headers show
Series net/mlx5: e-switch VXLAN encap/decap hardware offload | expand

Checks

Context Check Description
ci/Intel-compilation success Compilation OK
ci/checkpatch success coding style OK

Commit Message

Slava Ovsiienko Oct. 15, 2018, 2:13 p.m. UTC
VXLAN encap rules are applied to the VF ingress traffic and have the
VTEP as actual redirection destinations instead of outer PF.
The encapsulation rule should provide:
- redirection action VF->PF
- VF port ID
- some inner network parameters (MACs/IP)
- the tunnel outer source IP (v4/v6)
- the tunnel outer destination IP (v4/v6). Current
- VNI - Virtual Network Identifier

There is no direct way found to provide kernel with all required
encapsulation header parameters. The encapsulation VTEP is created
attached to the outer interface and assumed as default path for
egress encapsulated traffic. The outer tunnel IP addresses are
assigned to interface using Netlink, the implicit route is
created like this:

  ip addr add <src_ip> peer <dst_ip> dev <outer> scope link

Peer address provides implicit route, and scope link reduces
the risk of conflicts. At initialization time all local scope
link addresses are flushed from device (see next part of patchset).

The destination MAC address is provided via permanent neigh rule:

  ip neigh add dev <outer> lladdr <dst_mac> to <dst_ip> nud permanent

At initialization time all neigh rules of this type are flushed
from device (see the next part of patchset).

Suggested-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
---
 drivers/net/mlx5/mlx5_flow_tcf.c | 394 ++++++++++++++++++++++++++++++++++++++-
 1 file changed, 389 insertions(+), 5 deletions(-)

Comments

Yongseok Koh Oct. 25, 2018, 12:33 a.m. UTC | #1
On Mon, Oct 15, 2018 at 02:13:34PM +0000, Viacheslav Ovsiienko wrote:
> VXLAN encap rules are applied to the VF ingress traffic and have the
> VTEP as actual redirection destinations instead of outer PF.
> The encapsulation rule should provide:
> - redirection action VF->PF
> - VF port ID
> - some inner network parameters (MACs/IP)
> - the tunnel outer source IP (v4/v6)
> - the tunnel outer destination IP (v4/v6). Current
> - VNI - Virtual Network Identifier
> 
> There is no direct way found to provide kernel with all required
> encapsulatioh header parameters. The encapsulation VTEP is created
> attached to the outer interface and assumed as default path for
> egress encapsulated traffic. The outer tunnel IP address are
> assigned to interface using Netlink, the implicit route is
> created like this:
> 
>   ip addr add <src_ip> peer <dst_ip> dev <outer> scope link
> 
> Peer address provides implicit route, and scode link reduces
> the risk of conflicts. At initialization time all local scope
> link addresses are flushed from device (see next part of patchset).
> 
> The destination MAC address is provided via permenent neigh rule:
> 
>   ip neigh add dev <outer> lladdr <dst_mac> to <dst_ip> nud permanent
> 
> At initialization time all neigh rules of this type are flushed
> from device (see the next part of patchset).
> 
> Suggested-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
> ---
>  drivers/net/mlx5/mlx5_flow_tcf.c | 394 ++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 389 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/net/mlx5/mlx5_flow_tcf.c b/drivers/net/mlx5/mlx5_flow_tcf.c
> index efa9c3b..a1d7733 100644
> --- a/drivers/net/mlx5/mlx5_flow_tcf.c
> +++ b/drivers/net/mlx5/mlx5_flow_tcf.c
> @@ -3443,6 +3443,376 @@ struct pedit_parser {
>  	return -err;
>  }
>  
> +/**
> + * Emit Netlink message to add/remove local address to the outer device.
> + * The address being added is visible within the link only (scope link).
> + *
> + * Note that an implicit route is maintained by the kernel due to the
> + * presence of a peer address (IFA_ADDRESS).
> + *
> + * These rules are used for encapsultion only and allow to assign
> + * the outer tunnel source IP address.
> + *
> + * @param[in] tcf
> + *   Libmnl socket context object.
> + * @param[in] encap
> + *   Encapsulation properties (source address and its peer).
> + * @param[in] ifindex
> + *   Network interface to apply rule.
> + * @param[in] enable
> + *   Toggle between add and remove.
> + * @param[out] error
> + *   Perform verbose error reporting if not NULL.
> + *
> + * @return
> + *   0 on success, a negative errno value otherwise and rte_errno is set.
> + */
> +static int
> +flow_tcf_rule_local(struct mlx5_flow_tcf_context *tcf,
> +		    const struct mlx5_flow_tcf_vxlan_encap *encap,
> +		    unsigned int ifindex,
> +		    bool enable,
> +		    struct rte_flow_error *error)
> +{
> +	struct nlmsghdr *nlh;
> +	struct ifaddrmsg *ifa;
> +	alignas(struct nlmsghdr)
> +	uint8_t buf[mnl_nlmsg_size(sizeof(*ifa) + 128)];
> +
> +	nlh = mnl_nlmsg_put_header(buf);
> +	nlh->nlmsg_type = enable ? RTM_NEWADDR : RTM_DELADDR;
> +	nlh->nlmsg_flags =
> +		NLM_F_REQUEST | (enable ? NLM_F_CREATE | NLM_F_REPLACE : 0);
> +	nlh->nlmsg_seq = 0;
> +	ifa = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifa));
> +	ifa->ifa_flags = IFA_F_PERMANENT;
> +	ifa->ifa_scope = RT_SCOPE_LINK;
> +	ifa->ifa_index = ifindex;
> +	if (encap->mask & MLX5_FLOW_TCF_ENCAP_IPV4_SRC) {
> +		ifa->ifa_family = AF_INET;
> +		ifa->ifa_prefixlen = 32;
> +		mnl_attr_put_u32(nlh, IFA_LOCAL, encap->ipv4.src);
> +		if (encap->mask & MLX5_FLOW_TCF_ENCAP_IPV4_DST)
> +			mnl_attr_put_u32(nlh, IFA_ADDRESS,
> +					      encap->ipv4.dst);
> +	} else {
> +		assert(encap->mask & MLX5_FLOW_TCF_ENCAP_IPV6_SRC);
> +		ifa->ifa_family = AF_INET6;
> +		ifa->ifa_prefixlen = 128;
> +		mnl_attr_put(nlh, IFA_LOCAL,
> +				  sizeof(encap->ipv6.src),
> +				  &encap->ipv6.src);
> +		if (encap->mask & MLX5_FLOW_TCF_ENCAP_IPV6_DST)
> +			mnl_attr_put(nlh, IFA_ADDRESS,
> +					  sizeof(encap->ipv6.dst),
> +					  &encap->ipv6.dst);
> +	}
> +	if (!flow_tcf_nl_ack(tcf, nlh, 0, NULL, NULL))
> +		return 0;
> +	return rte_flow_error_set
> +		(error, rte_errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
> +		 "netlink: cannot complete IFA request (ip addr add)");
> +}
> +
> +/**
> + * Emit Netlink message to add/remove neighbor.
> + *
> + * @param[in] tcf
> + *   Libmnl socket context object.
> + * @param[in] encap
> + *   Encapsulation properties (destination address).
> + * @param[in] ifindex
> + *   Network interface.
> + * @param[in] enable
> + *   Toggle between add and remove.
> + * @param[out] error
> + *   Perform verbose error reporting if not NULL.
> + *
> + * @return
> + *   0 on success, a negative errno value otherwise and rte_errno is set.
> + */
> +static int
> +flow_tcf_rule_neigh(struct mlx5_flow_tcf_context *tcf,
> +		     const struct mlx5_flow_tcf_vxlan_encap *encap,
> +		     unsigned int ifindex,
> +		     bool enable,
> +		     struct rte_flow_error *error)
> +{
> +	struct nlmsghdr *nlh;
> +	struct ndmsg *ndm;
> +	alignas(struct nlmsghdr)
> +	uint8_t buf[mnl_nlmsg_size(sizeof(*ndm) + 128)];
> +
> +	nlh = mnl_nlmsg_put_header(buf);
> +	nlh->nlmsg_type = enable ? RTM_NEWNEIGH : RTM_DELNEIGH;
> +	nlh->nlmsg_flags =
> +		NLM_F_REQUEST | (enable ? NLM_F_CREATE | NLM_F_REPLACE : 0);
> +	nlh->nlmsg_seq = 0;
> +	ndm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ndm));
> +	ndm->ndm_ifindex = ifindex;
> +	ndm->ndm_state = NUD_PERMANENT;
> +	ndm->ndm_flags = 0;
> +	ndm->ndm_type = 0;
> +	if (encap->mask & MLX5_FLOW_TCF_ENCAP_IPV4_DST) {
> +		ndm->ndm_family = AF_INET;
> +		mnl_attr_put_u32(nlh, NDA_DST, encap->ipv4.dst);
> +	} else {
> +		assert(encap->mask & MLX5_FLOW_TCF_ENCAP_IPV6_DST);
> +		ndm->ndm_family = AF_INET6;
> +		mnl_attr_put(nlh, NDA_DST, sizeof(encap->ipv6.dst),
> +						 &encap->ipv6.dst);
> +	}
> +	if (encap->mask & MLX5_FLOW_TCF_ENCAP_ETH_SRC && enable)
> +		DRV_LOG(WARNING,
> +			"Outer ethernet source address cannot be "
> +			"forced for VXLAN encapsulation");
> +	if (encap->mask & MLX5_FLOW_TCF_ENCAP_ETH_DST)
> +		mnl_attr_put(nlh, NDA_LLADDR, sizeof(encap->eth.dst),
> +						    &encap->eth.dst);
> +	if (!flow_tcf_nl_ack(tcf, nlh, 0, NULL, NULL))
> +		return 0;
> +	return rte_flow_error_set
> +		(error, rte_errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
> +		 "netlink: cannot complete ND request (ip neigh)");
> +}
> +
> +/**
> + * Manage the local IP addresses and their peers IP addresses on the
> + * outer interface for encapsulation purposes. The kernel searches the
> + * appropriate device for tunnel egress traffic using the outer source
> + * IP, this IP should be assigned to the outer network device, otherwise
> + * kernel rejects the rule.
> + *
> + * Adds or removes the addresses using the Netlink command like this:
> + *   ip addr add <src_ip> peer <dst_ip> scope link dev <ifouter>
> + *
> + * The addresses are local to the netdev ("scope link"), this reduces
> + * the risk of conflicts. Note that an implicit route is maintained by
> + * the kernel due to the presence of a peer address (IFA_ADDRESS).
> + *
> + * @param[in] tcf
> + *   Libmnl socket context object.
> + * @param[in] vtep
> + *   VTEP object, contains rule database and ifouter index.
> + * @param[in] dev_flow
> + *   Flow object, contains the tunnel parameters (for encap only).
> + * @param[in] enable
> + *   Toggle between add and remove.
> + * @param[out] error
> + *   Perform verbose error reporting if not NULL.
> + *
> + * @return
> + *   0 on success, a negative errno value otherwise and rte_errno is set.
> + */
> +static int
> +flow_tcf_encap_local(struct mlx5_flow_tcf_context *tcf,
> +		     struct mlx5_flow_tcf_vtep *vtep,
> +		     struct mlx5_flow *dev_flow,
> +		     bool enable,
> +		     struct rte_flow_error *error)
> +{
> +	const struct mlx5_flow_tcf_vxlan_encap *encap =
> +						dev_flow->tcf.vxlan_encap;
> +	struct tcf_local_rule *rule;
> +	bool found = false;
> +	int ret;
> +
> +	assert(encap);
> +	assert(encap->hdr.type == MLX5_FLOW_TCF_TUNACT_VXLAN_ENCAP);
> +	if (encap->mask & MLX5_FLOW_TCF_ENCAP_IPV4_SRC) {
> +		assert(encap->mask & MLX5_FLOW_TCF_ENCAP_IPV4_DST);
> +		LIST_FOREACH(rule, &vtep->local, next) {
> +			if (rule->mask & MLX5_FLOW_TCF_ENCAP_IPV4_SRC &&
> +			    encap->ipv4.src == rule->ipv4.src &&
> +			    encap->ipv4.dst == rule->ipv4.dst) {
> +				found = true;
> +				break;
> +			}
> +		}
> +	} else {
> +		assert(encap->mask & MLX5_FLOW_TCF_ENCAP_IPV6_SRC);
> +		assert(encap->mask & MLX5_FLOW_TCF_ENCAP_IPV6_DST);
> +		LIST_FOREACH(rule, &vtep->local, next) {
> +			if (rule->mask & MLX5_FLOW_TCF_ENCAP_IPV6_SRC &&
> +			    !memcmp(&encap->ipv6.src, &rule->ipv6.src,
> +					    sizeof(encap->ipv6.src)) &&
> +			    !memcmp(&encap->ipv6.dst, &rule->ipv6.dst,
> +					    sizeof(encap->ipv6.dst))) {
> +				found = true;
> +				break;
> +			}
> +		}
> +	}
> +	if (found) {
> +		if (enable) {
> +			rule->refcnt++;
> +			return 0;
> +		}
> +		if (!rule->refcnt || !--rule->refcnt) {

Same suggestion for this as that of vtep - refcnt handling and adding get()
func.

> +			LIST_REMOVE(rule, next);
> +			return flow_tcf_rule_local(tcf, encap,
> +					vtep->ifouter, false, error);
> +		}
> +		return 0;
> +	}
> +	if (!enable) {
> +		DRV_LOG(WARNING, "Disabling not existing local rule");
> +		rte_flow_error_set
> +			(error, ENOENT, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
> +			 NULL, "Disabling not existing local rule");
> +		return -ENOENT;
> +	}
> +	rule = rte_zmalloc(__func__, sizeof(struct tcf_local_rule),
> +				alignof(struct tcf_local_rule));
> +	if (!rule) {
> +		rte_flow_error_set
> +			(error, ENOMEM, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
> +			 NULL, "unable to allocate memory for local rule");
> +		return -rte_errno;
> +	}
> +	*rule = (struct tcf_local_rule){.refcnt = 0,
> +					.mask = 0,
> +					};

Is it effective? The allocated memory is already zeroed out.

> +	if (encap->mask & MLX5_FLOW_TCF_ENCAP_IPV4_SRC) {
> +		rule->mask = MLX5_FLOW_TCF_ENCAP_IPV4_SRC
> +			   | MLX5_FLOW_TCF_ENCAP_IPV4_DST;
> +		rule->ipv4.src = encap->ipv4.src;
> +		rule->ipv4.dst = encap->ipv4.dst;
> +	} else {
> +		rule->mask = MLX5_FLOW_TCF_ENCAP_IPV6_SRC
> +			   | MLX5_FLOW_TCF_ENCAP_IPV6_DST;
> +		memcpy(&rule->ipv6.src, &encap->ipv6.src,
> +				sizeof(rule->ipv6.src));
> +		memcpy(&rule->ipv6.dst, &encap->ipv6.dst,
> +				sizeof(rule->ipv6.dst));
> +	}
> +	ret = flow_tcf_rule_local(tcf, encap, vtep->ifouter, true, error);
> +	if (ret) {
> +		rte_free(rule);
> +		return ret;
> +	}
> +	rule->refcnt++;
> +	LIST_INSERT_HEAD(&vtep->local, rule, next);
> +	return 0;
> +}
> +
> +/**
> + * Manage the destination MAC/IP addresses neigh database, kernel uses
> + * this one to determine the destination MAC address within encapsulation
> + * header. Adds or removes the entries using the Netlink command like this:
> + *   ip neigh add dev <ifouter> lladdr <dst_mac> to <dst_ip> nud permanent
> + *
> + * @param[in] tcf
> + *   Libmnl socket context object.
> + * @param[in] vtep
> + *   VTEP object, contains rule database and ifouter index.
> + * @param[in] dev_flow
> + *   Flow object, contains the tunnel parameters (for encap only).
> + * @param[in] enable
> + *   Toggle between add and remove.
> + * @param[out] error
> + *   Perform verbose error reporting if not NULL.
> + *
> + * @return
> + *   0 on success, a negative errno value otherwise and rte_errno is set.
> + */
> +static int
> +flow_tcf_encap_neigh(struct mlx5_flow_tcf_context *tcf,
> +		     struct mlx5_flow_tcf_vtep *vtep,
> +		     struct mlx5_flow *dev_flow,
> +		     bool enable,
> +		     struct rte_flow_error *error)
> +{
> +	const struct mlx5_flow_tcf_vxlan_encap *encap =
> +						dev_flow->tcf.vxlan_encap;
> +	struct tcf_neigh_rule *rule;
> +	bool found = false;
> +	int ret;
> +
> +	assert(encap);
> +	assert(encap->hdr.type == MLX5_FLOW_TCF_TUNACT_VXLAN_ENCAP);
> +	if (encap->mask & MLX5_FLOW_TCF_ENCAP_IPV4_DST) {
> +		assert(encap->mask & MLX5_FLOW_TCF_ENCAP_IPV4_SRC);
> +		LIST_FOREACH(rule, &vtep->neigh, next) {
> +			if (rule->mask & MLX5_FLOW_TCF_ENCAP_IPV4_DST &&
> +			    encap->ipv4.dst == rule->ipv4.dst) {
> +				found = true;
> +				break;
> +			}
> +		}
> +	} else {
> +		assert(encap->mask & MLX5_FLOW_TCF_ENCAP_IPV6_SRC);
> +		assert(encap->mask & MLX5_FLOW_TCF_ENCAP_IPV6_DST);
> +		LIST_FOREACH(rule, &vtep->neigh, next) {
> +			if (rule->mask & MLX5_FLOW_TCF_ENCAP_IPV6_DST &&
> +			    !memcmp(&encap->ipv6.dst, &rule->ipv6.dst,
> +						sizeof(encap->ipv6.dst))) {
> +				found = true;
> +				break;
> +			}
> +		}
> +	}
> +	if (found) {
> +		if (memcmp(&encap->eth.dst, &rule->eth,
> +			   sizeof(encap->eth.dst))) {
> +			DRV_LOG(WARNING, "Destination MAC differs"
> +					 " in neigh rule");
> +			rte_flow_error_set(error, EEXIST,
> +					   RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
> +					   NULL, "Different MAC address"
> +					   " neigh rule for the same"
> +					   " destination IP");
> +					return -EEXIST;
> +		}
> +		if (enable) {
> +			rule->refcnt++;
> +			return 0;
> +		}
> +		if (!rule->refcnt || !--rule->refcnt) {

Same suggestion for this as that of vtep - refcnt handling by adding
create()/get()/release() func.

> +			LIST_REMOVE(rule, next);
> +			return flow_tcf_rule_neigh(tcf, encap,
> +						   vtep->ifouter,
> +						   false, error);
> +		}
> +		return 0;
> +	}
> +	if (!enable) {
> +		DRV_LOG(WARNING, "Disabling not existing neigh rule");
> +		rte_flow_error_set
> +			(error, ENOENT, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
> +			 NULL, "unable to allocate memory for neigh rule");
> +		return -ENOENT;
> +	}
> +	rule = rte_zmalloc(__func__, sizeof(struct tcf_neigh_rule),
> +				alignof(struct tcf_neigh_rule));
> +	if (!rule) {
> +		rte_flow_error_set
> +			(error, ENOMEM, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
> +			 NULL, "unadble to allocate memory for neigh rule");
> +		return -rte_errno;
> +	}
> +	*rule = (struct tcf_neigh_rule){.refcnt = 0,
> +					.mask = 0,
> +					};

Is it effective? The allocated memory is already zeroed out.

> +	if (encap->mask & MLX5_FLOW_TCF_ENCAP_IPV4_DST) {
> +		rule->mask = MLX5_FLOW_TCF_ENCAP_IPV4_DST;
> +		rule->ipv4.dst = encap->ipv4.dst;
> +	} else {
> +		rule->mask = MLX5_FLOW_TCF_ENCAP_IPV6_DST;
> +		memcpy(&rule->ipv6.dst, &encap->ipv6.dst,
> +					sizeof(rule->ipv6.dst));
> +	}
> +	memcpy(&rule->eth, &encap->eth.dst, sizeof(rule->eth));
> +	ret = flow_tcf_rule_neigh(tcf, encap, vtep->ifouter, true, error);
> +	if (ret) {
> +		rte_free(rule);
> +		return ret;
> +	}
> +	rule->refcnt++;
> +	LIST_INSERT_HEAD(&vtep->neigh, rule, next);
> +	return 0;
> +}
> +
>  /* VTEP device list is shared between PMD port instances. */
>  static LIST_HEAD(, mlx5_flow_tcf_vtep)
>  			vtep_list_vxlan = LIST_HEAD_INITIALIZER();
> @@ -3715,6 +4085,7 @@ static LIST_HEAD(, mlx5_flow_tcf_vtep)
>  {
>  	static uint16_t encap_port = MLX5_VXLAN_PORT_RANGE_MIN - 1;
>  	struct mlx5_flow_tcf_vtep *vtep, *vlst;
> +	int ret;
>  
>  	assert(ifouter);
>  	/* Look whether the attached VTEP for encap is created. */
> @@ -3766,6 +4137,21 @@ static LIST_HEAD(, mlx5_flow_tcf_vtep)
>  	}
>  	if (!vtep)
>  		return 0;
> +	/* Create local ipaddr with peer to specify the outer IPs. */
> +	ret = flow_tcf_encap_local(tcf, vtep, dev_flow, true, error);
> +	if (ret) {
> +		if (!vtep->refcnt)
> +			flow_tcf_delete_iface(tcf, vtep);

There's no possibility of decreasing vtep->refcnt in flow_tcf_encap_local(),
then why do you expect it to be zero here? If it is already zero at this point,
it should've been deleted when it became zero.

> +		return 0;
> +	}
> +	/* Create neigh rule to specify outer destination MAC. */
> +	ret = flow_tcf_encap_neigh(tcf, vtep, dev_flow, true, error);
> +	if (ret) {
> +		flow_tcf_encap_local(tcf, vtep, dev_flow, false, error);
> +		if (!vtep->refcnt)
> +			flow_tcf_delete_iface(tcf, vtep);

Same here.

Thanks,
Yongseok

> +		return 0;
> +	}
>  	vtep->refcnt++;
>  	assert(vtep->ifindex);
>  	return vtep->ifindex;
> @@ -3848,11 +4234,9 @@ static LIST_HEAD(, mlx5_flow_tcf_vtep)
>  	case MLX5_FLOW_TCF_TUNACT_VXLAN_DECAP:
>  		break;
>  	case MLX5_FLOW_TCF_TUNACT_VXLAN_ENCAP:
> -/*
> - * TODO: Remove the encap ancillary rules first.
> - * flow_tcf_encap_neigh(tcf, vtep, dev_flow, false, NULL);
> - * flow_tcf_encap_local(tcf, vtep, dev_flow, false, NULL);
> - */
> +		/* Remove the encap ancillary rules first. */
> +		flow_tcf_encap_neigh(tcf, vtep, dev_flow, false, NULL);
> +		flow_tcf_encap_local(tcf, vtep, dev_flow, false, NULL);
>  		break;
>  	default:
>  		assert(false);
>
diff mbox series

Patch

diff --git a/drivers/net/mlx5/mlx5_flow_tcf.c b/drivers/net/mlx5/mlx5_flow_tcf.c
index efa9c3b..a1d7733 100644
--- a/drivers/net/mlx5/mlx5_flow_tcf.c
+++ b/drivers/net/mlx5/mlx5_flow_tcf.c
@@ -3443,6 +3443,376 @@  struct pedit_parser {
 	return -err;
 }
 
+/**
+ * Emit Netlink message to add/remove local address to the outer device.
+ * The address being added is visible within the link only (scope link).
+ *
+ * Note that an implicit route is maintained by the kernel due to the
+ * presence of a peer address (IFA_ADDRESS).
+ *
+ * These rules are used for encapsulation only and allow assigning
+ * the outer tunnel source IP address.
+ *
+ * @param[in] tcf
+ *   Libmnl socket context object.
+ * @param[in] encap
+ *   Encapsulation properties (source address and its peer).
+ * @param[in] ifindex
+ *   Index of the network interface the address is assigned to.
+ * @param[in] enable
+ *   Toggle between add (RTM_NEWADDR) and remove (RTM_DELADDR).
+ * @param[out] error
+ *   Perform verbose error reporting if not NULL.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+flow_tcf_rule_local(struct mlx5_flow_tcf_context *tcf,
+		    const struct mlx5_flow_tcf_vxlan_encap *encap,
+		    unsigned int ifindex,
+		    bool enable,
+		    struct rte_flow_error *error)
+{
+	struct nlmsghdr *nlh;
+	struct ifaddrmsg *ifa;
+	alignas(struct nlmsghdr)
+	uint8_t buf[mnl_nlmsg_size(sizeof(*ifa) + 128)];
+
+	nlh = mnl_nlmsg_put_header(buf);
+	nlh->nlmsg_type = enable ? RTM_NEWADDR : RTM_DELADDR;
+	nlh->nlmsg_flags =
+		NLM_F_REQUEST | (enable ? NLM_F_CREATE | NLM_F_REPLACE : 0);
+	nlh->nlmsg_seq = 0;
+	ifa = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifa));
+	ifa->ifa_flags = IFA_F_PERMANENT;
+	ifa->ifa_scope = RT_SCOPE_LINK;
+	ifa->ifa_index = ifindex;
+	if (encap->mask & MLX5_FLOW_TCF_ENCAP_IPV4_SRC) {
+		ifa->ifa_family = AF_INET;
+		ifa->ifa_prefixlen = 32;
+		mnl_attr_put_u32(nlh, IFA_LOCAL, encap->ipv4.src);
+		if (encap->mask & MLX5_FLOW_TCF_ENCAP_IPV4_DST)
+			mnl_attr_put_u32(nlh, IFA_ADDRESS,
+					      encap->ipv4.dst);
+	} else {
+		assert(encap->mask & MLX5_FLOW_TCF_ENCAP_IPV6_SRC);
+		ifa->ifa_family = AF_INET6;
+		ifa->ifa_prefixlen = 128;
+		mnl_attr_put(nlh, IFA_LOCAL,
+				  sizeof(encap->ipv6.src),
+				  &encap->ipv6.src);
+		if (encap->mask & MLX5_FLOW_TCF_ENCAP_IPV6_DST)
+			mnl_attr_put(nlh, IFA_ADDRESS,
+					  sizeof(encap->ipv6.dst),
+					  &encap->ipv6.dst);
+	}
+	if (!flow_tcf_nl_ack(tcf, nlh, 0, NULL, NULL))
+		return 0;
+	return rte_flow_error_set
+		(error, rte_errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+		 "netlink: cannot complete IFA request (ip addr add)");
+}
+
+/**
+ * Emit Netlink message to add/remove permanent neighbor entry.
+ *
+ * @param[in] tcf
+ *   Libmnl socket context object.
+ * @param[in] encap
+ *   Encapsulation properties (destination IP and MAC addresses).
+ * @param[in] ifindex
+ *   Index of the network interface the neighbor entry belongs to.
+ * @param[in] enable
+ *   Toggle between add (RTM_NEWNEIGH) and remove (RTM_DELNEIGH).
+ * @param[out] error
+ *   Perform verbose error reporting if not NULL.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+flow_tcf_rule_neigh(struct mlx5_flow_tcf_context *tcf,
+		     const struct mlx5_flow_tcf_vxlan_encap *encap,
+		     unsigned int ifindex,
+		     bool enable,
+		     struct rte_flow_error *error)
+{
+	struct nlmsghdr *nlh;
+	struct ndmsg *ndm;
+	alignas(struct nlmsghdr)
+	uint8_t buf[mnl_nlmsg_size(sizeof(*ndm) + 128)];
+
+	nlh = mnl_nlmsg_put_header(buf);
+	nlh->nlmsg_type = enable ? RTM_NEWNEIGH : RTM_DELNEIGH;
+	nlh->nlmsg_flags =
+		NLM_F_REQUEST | (enable ? NLM_F_CREATE | NLM_F_REPLACE : 0);
+	nlh->nlmsg_seq = 0;
+	ndm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ndm));
+	ndm->ndm_ifindex = ifindex;
+	ndm->ndm_state = NUD_PERMANENT;
+	ndm->ndm_flags = 0;
+	ndm->ndm_type = 0;
+	if (encap->mask & MLX5_FLOW_TCF_ENCAP_IPV4_DST) {
+		ndm->ndm_family = AF_INET;
+		mnl_attr_put_u32(nlh, NDA_DST, encap->ipv4.dst);
+	} else {
+		assert(encap->mask & MLX5_FLOW_TCF_ENCAP_IPV6_DST);
+		ndm->ndm_family = AF_INET6;
+		mnl_attr_put(nlh, NDA_DST, sizeof(encap->ipv6.dst),
+						 &encap->ipv6.dst);
+	}
+	if (encap->mask & MLX5_FLOW_TCF_ENCAP_ETH_SRC && enable)
+		DRV_LOG(WARNING,
+			"Outer ethernet source address cannot be "
+			"forced for VXLAN encapsulation");
+	if (encap->mask & MLX5_FLOW_TCF_ENCAP_ETH_DST)
+		mnl_attr_put(nlh, NDA_LLADDR, sizeof(encap->eth.dst),
+						    &encap->eth.dst);
+	if (!flow_tcf_nl_ack(tcf, nlh, 0, NULL, NULL))
+		return 0;
+	return rte_flow_error_set
+		(error, rte_errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+		 "netlink: cannot complete ND request (ip neigh)");
+}
+
+/**
+ * Manage the local IP addresses and their peer IP addresses on the
+ * outer interface for encapsulation purposes. The kernel searches the
+ * appropriate device for tunnel egress traffic using the outer source
+ * IP, this IP should be assigned to the outer network device, otherwise
+ * kernel rejects the rule.
+ *
+ * Adds or removes the addresses using the Netlink command like this:
+ *   ip addr add <src_ip> peer <dst_ip> scope link dev <ifouter>
+ *
+ * The addresses are local to the netdev ("scope link"), this reduces
+ * the risk of conflicts. Note that an implicit route is maintained by
+ * the kernel due to the presence of a peer address (IFA_ADDRESS).
+ *
+ * @param[in] tcf
+ *   Libmnl socket context object.
+ * @param[in, out] vtep
+ *   VTEP object, contains rule database (updated) and ifouter index.
+ * @param[in] dev_flow
+ *   Flow object, contains the tunnel parameters (for encap only).
+ * @param[in] enable
+ *   Toggle between add (take reference) and remove (drop reference).
+ * @param[out] error
+ *   Perform verbose error reporting if not NULL.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+flow_tcf_encap_local(struct mlx5_flow_tcf_context *tcf,
+		     struct mlx5_flow_tcf_vtep *vtep,
+		     struct mlx5_flow *dev_flow,
+		     bool enable,
+		     struct rte_flow_error *error)
+{
+	const struct mlx5_flow_tcf_vxlan_encap *encap =
+						dev_flow->tcf.vxlan_encap;
+	struct tcf_local_rule *rule;
+	bool found = false;
+	int ret;
+
+	assert(encap);
+	assert(encap->hdr.type == MLX5_FLOW_TCF_TUNACT_VXLAN_ENCAP);
+	if (encap->mask & MLX5_FLOW_TCF_ENCAP_IPV4_SRC) {
+		assert(encap->mask & MLX5_FLOW_TCF_ENCAP_IPV4_DST);
+		LIST_FOREACH(rule, &vtep->local, next) {
+			if (rule->mask & MLX5_FLOW_TCF_ENCAP_IPV4_SRC &&
+			    encap->ipv4.src == rule->ipv4.src &&
+			    encap->ipv4.dst == rule->ipv4.dst) {
+				found = true;
+				break;
+			}
+		}
+	} else {
+		assert(encap->mask & MLX5_FLOW_TCF_ENCAP_IPV6_SRC);
+		assert(encap->mask & MLX5_FLOW_TCF_ENCAP_IPV6_DST);
+		LIST_FOREACH(rule, &vtep->local, next) {
+			if (rule->mask & MLX5_FLOW_TCF_ENCAP_IPV6_SRC &&
+			    !memcmp(&encap->ipv6.src, &rule->ipv6.src,
+					    sizeof(encap->ipv6.src)) &&
+			    !memcmp(&encap->ipv6.dst, &rule->ipv6.dst,
+					    sizeof(encap->ipv6.dst))) {
+				found = true;
+				break;
+			}
+		}
+	}
+	if (found) {
+		if (enable) {
+			rule->refcnt++;
+			return 0;
+		}
+		if (!rule->refcnt || !--rule->refcnt) {
+			/* Last reference, release the rule entry. */
+			LIST_REMOVE(rule, next);
+			rte_free(rule);
+			return flow_tcf_rule_local(tcf, encap,
+					vtep->ifouter, false, error);
+		}
+		return 0;
+	}
+	if (!enable) {
+		DRV_LOG(WARNING, "Disabling not existing local rule");
+		rte_flow_error_set
+			(error, ENOENT, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+			 NULL, "Disabling not existing local rule");
+		return -ENOENT;
+	}
+	/* The entry is zeroed by allocator, refcnt starts from 0. */
+	rule = rte_zmalloc(__func__, sizeof(struct tcf_local_rule),
+				alignof(struct tcf_local_rule));
+	if (!rule) {
+		rte_flow_error_set
+			(error, ENOMEM, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+			 NULL, "unable to allocate memory for local rule");
+		return -rte_errno;
+	}
+	if (encap->mask & MLX5_FLOW_TCF_ENCAP_IPV4_SRC) {
+		rule->mask = MLX5_FLOW_TCF_ENCAP_IPV4_SRC
+			   | MLX5_FLOW_TCF_ENCAP_IPV4_DST;
+		rule->ipv4.src = encap->ipv4.src;
+		rule->ipv4.dst = encap->ipv4.dst;
+	} else {
+		rule->mask = MLX5_FLOW_TCF_ENCAP_IPV6_SRC
+			   | MLX5_FLOW_TCF_ENCAP_IPV6_DST;
+		memcpy(&rule->ipv6.src, &encap->ipv6.src,
+				sizeof(rule->ipv6.src));
+		memcpy(&rule->ipv6.dst, &encap->ipv6.dst,
+				sizeof(rule->ipv6.dst));
+	}
+	ret = flow_tcf_rule_local(tcf, encap, vtep->ifouter, true, error);
+	if (ret) {
+		rte_free(rule);
+		return ret;
+	}
+	rule->refcnt++;
+	LIST_INSERT_HEAD(&vtep->local, rule, next);
+	return 0;
+}
+
+/**
+ * Manage the destination MAC/IP addresses neigh database, kernel uses
+ * this one to determine the destination MAC address within encapsulation
+ * header. Adds or removes the entries using the Netlink command like this:
+ *   ip neigh add dev <ifouter> lladdr <dst_mac> to <dst_ip> nud permanent
+ *
+ * @param[in] tcf
+ *   Libmnl socket context object.
+ * @param[in, out] vtep
+ *   VTEP object, contains rule database (updated) and ifouter index.
+ * @param[in] dev_flow
+ *   Flow object, contains the tunnel parameters (for encap only).
+ * @param[in] enable
+ *   Toggle between add (take reference) and remove (drop reference).
+ * @param[out] error
+ *   Perform verbose error reporting if not NULL.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+flow_tcf_encap_neigh(struct mlx5_flow_tcf_context *tcf,
+		     struct mlx5_flow_tcf_vtep *vtep,
+		     struct mlx5_flow *dev_flow,
+		     bool enable,
+		     struct rte_flow_error *error)
+{
+	const struct mlx5_flow_tcf_vxlan_encap *encap =
+						dev_flow->tcf.vxlan_encap;
+	struct tcf_neigh_rule *rule;
+	bool found = false;
+	int ret;
+
+	assert(encap);
+	assert(encap->hdr.type == MLX5_FLOW_TCF_TUNACT_VXLAN_ENCAP);
+	if (encap->mask & MLX5_FLOW_TCF_ENCAP_IPV4_DST) {
+		assert(encap->mask & MLX5_FLOW_TCF_ENCAP_IPV4_SRC);
+		LIST_FOREACH(rule, &vtep->neigh, next) {
+			if (rule->mask & MLX5_FLOW_TCF_ENCAP_IPV4_DST &&
+			    encap->ipv4.dst == rule->ipv4.dst) {
+				found = true;
+				break;
+			}
+		}
+	} else {
+		assert(encap->mask & MLX5_FLOW_TCF_ENCAP_IPV6_SRC);
+		assert(encap->mask & MLX5_FLOW_TCF_ENCAP_IPV6_DST);
+		LIST_FOREACH(rule, &vtep->neigh, next) {
+			if (rule->mask & MLX5_FLOW_TCF_ENCAP_IPV6_DST &&
+			    !memcmp(&encap->ipv6.dst, &rule->ipv6.dst,
+						sizeof(encap->ipv6.dst))) {
+				found = true;
+				break;
+			}
+		}
+	}
+	if (found) {
+		if (memcmp(&encap->eth.dst, &rule->eth,
+			   sizeof(encap->eth.dst))) {
+			DRV_LOG(WARNING, "Destination MAC differs"
+					 " in neigh rule");
+			rte_flow_error_set(error, EEXIST,
+					   RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+					   NULL, "Different MAC address"
+					   " neigh rule for the same"
+					   " destination IP");
+			return -EEXIST;
+		}
+		if (enable) {
+			rule->refcnt++;
+			return 0;
+		}
+		if (!rule->refcnt || !--rule->refcnt) {
+			/* Last reference, release the rule entry. */
+			LIST_REMOVE(rule, next);
+			rte_free(rule);
+			return flow_tcf_rule_neigh(tcf, encap,
+						   vtep->ifouter,
+						   false, error);
+		}
+		return 0;
+	}
+	if (!enable) {
+		DRV_LOG(WARNING, "Disabling not existing neigh rule");
+		rte_flow_error_set
+			(error, ENOENT, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+			 NULL, "Disabling not existing neigh rule");
+		return -ENOENT;
+	}
+	/* The entry is zeroed by allocator, refcnt starts from 0. */
+	rule = rte_zmalloc(__func__, sizeof(struct tcf_neigh_rule),
+				alignof(struct tcf_neigh_rule));
+	if (!rule) {
+		rte_flow_error_set
+			(error, ENOMEM, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+			 NULL, "unable to allocate memory for neigh rule");
+		return -rte_errno;
+	}
+	if (encap->mask & MLX5_FLOW_TCF_ENCAP_IPV4_DST) {
+		rule->mask = MLX5_FLOW_TCF_ENCAP_IPV4_DST;
+		rule->ipv4.dst = encap->ipv4.dst;
+	} else {
+		rule->mask = MLX5_FLOW_TCF_ENCAP_IPV6_DST;
+		memcpy(&rule->ipv6.dst, &encap->ipv6.dst,
+					sizeof(rule->ipv6.dst));
+	}
+	memcpy(&rule->eth, &encap->eth.dst, sizeof(rule->eth));
+	ret = flow_tcf_rule_neigh(tcf, encap, vtep->ifouter, true, error);
+	if (ret) {
+		rte_free(rule);
+		return ret;
+	}
+	rule->refcnt++;
+	LIST_INSERT_HEAD(&vtep->neigh, rule, next);
+	return 0;
+}
+
 /* VTEP device list is shared between PMD port instances. */
 static LIST_HEAD(, mlx5_flow_tcf_vtep)
 			vtep_list_vxlan = LIST_HEAD_INITIALIZER();
@@ -3715,6 +4085,7 @@  static LIST_HEAD(, mlx5_flow_tcf_vtep)
 {
 	static uint16_t encap_port = MLX5_VXLAN_PORT_RANGE_MIN - 1;
 	struct mlx5_flow_tcf_vtep *vtep, *vlst;
+	int ret;
 
 	assert(ifouter);
 	/* Look whether the attached VTEP for encap is created. */
@@ -3766,6 +4137,21 @@  static LIST_HEAD(, mlx5_flow_tcf_vtep)
 	}
 	if (!vtep)
 		return 0;
+	/* Create local ipaddr with peer to specify the outer IPs. */
+	ret = flow_tcf_encap_local(tcf, vtep, dev_flow, true, error);
+	if (ret) {
+		if (!vtep->refcnt)
+			flow_tcf_delete_iface(tcf, vtep);
+		return 0;
+	}
+	/* Create neigh rule to specify outer destination MAC. */
+	ret = flow_tcf_encap_neigh(tcf, vtep, dev_flow, true, error);
+	if (ret) {
+		flow_tcf_encap_local(tcf, vtep, dev_flow, false, error);
+		if (!vtep->refcnt)
+			flow_tcf_delete_iface(tcf, vtep);
+		return 0;
+	}
 	vtep->refcnt++;
 	assert(vtep->ifindex);
 	return vtep->ifindex;
@@ -3848,11 +4234,9 @@  static LIST_HEAD(, mlx5_flow_tcf_vtep)
 	case MLX5_FLOW_TCF_TUNACT_VXLAN_DECAP:
 		break;
 	case MLX5_FLOW_TCF_TUNACT_VXLAN_ENCAP:
-/*
- * TODO: Remove the encap ancillary rules first.
- * flow_tcf_encap_neigh(tcf, vtep, dev_flow, false, NULL);
- * flow_tcf_encap_local(tcf, vtep, dev_flow, false, NULL);
- */
+		/* Remove the encap ancillary rules first. */
+		flow_tcf_encap_neigh(tcf, vtep, dev_flow, false, NULL);
+		flow_tcf_encap_local(tcf, vtep, dev_flow, false, NULL);
 		break;
 	default:
 		assert(false);