[dpdk-dev,v5,3/3] net/failsafe: fix calling device during RMV events

Message ID 1518107653-15466-4-git-send-email-matan@mellanox.com (mailing list archive)
State Superseded, archived
Delegated to: Ferruh Yigit
Headers

Checks

Context Check Description
ci/checkpatch warning coding style issues
ci/Intel-compilation success Compilation OK

Commit Message

Matan Azrad Feb. 8, 2018, 4:34 p.m. UTC
  Following are the failure steps:
1. The physical device is removed due to change in one of PF parameters
(e.g. MTU) 2. The interrupt thread flags the device 3. Within 2 seconds
Interrupt thread initializes the actual device removal, then every 2
seconds it tries to re-sync (plug in) the device. The trials fail as
long as VF parameter mismatches the PF parameter.
4. A control thread initiates a control operation on failsafe which
initiates this operation on the device.
5. A race condition occurs between the control thread and interrupt
thread when accessing the device data structures.

This patch mitigates the race condition in step 5.

Fixes: a46f8d584eb8 ("net/failsafe: add fail-safe PMD")
Cc: stable@dpdk.org

Signed-off-by: Matan Azrad <matan@mellanox.com>
---
 drivers/net/failsafe/failsafe.c         |  2 +-
 drivers/net/failsafe/failsafe_eal.c     |  2 +-
 drivers/net/failsafe/failsafe_ether.c   |  2 +-
 drivers/net/failsafe/failsafe_ops.c     | 26 +++++++++++++++++--------
 drivers/net/failsafe/failsafe_private.h | 34 ++++++++++++++++++++++++++-------
 5 files changed, 48 insertions(+), 18 deletions(-)
  

Comments

Gaëtan Rivet Feb. 8, 2018, 6:11 p.m. UTC | #1
On Thu, Feb 08, 2018 at 04:34:13PM +0000, Matan Azrad wrote:
> Following are the failure steps:
> 1. The physical device is removed due to change in one of PF parameters
> (e.g. MTU) 2. The interrupt thread flags the device 3. Within 2 seconds
> Interrupt thread initializes the actual device removal, then every 2
> seconds it tries to re-sync (plug in) the device. The trials fail as
> long as VF parameter mismatches the PF parameter.
> 4. A control thread initiates a control operation on failsafe which
> initiates this operation on the device.
> 5. A race condition occurs between the control thread and interrupt
> thread when accessing the device data structures.
> 
> This patch mitigates the race condition in step 5.
> 
> Fixes: a46f8d584eb8 ("net/failsafe: add fail-safe PMD")
> Cc: stable@dpdk.org
> 
> Signed-off-by: Matan Azrad <matan@mellanox.com>
> ---
>  drivers/net/failsafe/failsafe.c         |  2 +-
>  drivers/net/failsafe/failsafe_eal.c     |  2 +-
>  drivers/net/failsafe/failsafe_ether.c   |  2 +-
>  drivers/net/failsafe/failsafe_ops.c     | 26 +++++++++++++++++--------
>  drivers/net/failsafe/failsafe_private.h | 34 ++++++++++++++++++++++++++-------
>  5 files changed, 48 insertions(+), 18 deletions(-)
> 
> diff --git a/drivers/net/failsafe/failsafe.c b/drivers/net/failsafe/failsafe.c
> index 7b2cdbb..6cdefd0 100644
> --- a/drivers/net/failsafe/failsafe.c
> +++ b/drivers/net/failsafe/failsafe.c
> @@ -187,7 +187,7 @@
>  		 * If MAC address was provided as a parameter,
>  		 * apply to all probed slaves.
>  		 */
> -		FOREACH_SUBDEV_STATE(sdev, i, dev, DEV_PROBED) {
> +		FOREACH_SUBDEV_STATE_UNSAFE(sdev, i, dev, DEV_PROBED) {

No need for the UNSAFE here. The ports should have been just
initialized, and sdev->remove should be 0.

If sdev->remove is 1, then it means it has been set already by a plugout
event, meaning that rte_eth_dev_default_mac_addr_set should not even be
called on it.

>  			ret = rte_eth_dev_default_mac_addr_set(PORT_ID(sdev),
>  							       mac);
>  			if (ret) {
> diff --git a/drivers/net/failsafe/failsafe_eal.c b/drivers/net/failsafe/failsafe_eal.c
> index c3d6731..b3b9c32 100644
> --- a/drivers/net/failsafe/failsafe_eal.c
> +++ b/drivers/net/failsafe/failsafe_eal.c
> @@ -126,7 +126,7 @@
>  	int sdev_ret;
>  	int ret = 0;
>  
> -	FOREACH_SUBDEV_STATE(sdev, i, dev, DEV_PROBED) {
> +	FOREACH_SUBDEV_STATE_UNSAFE(sdev, i, dev, DEV_PROBED) {
>  		sdev_ret = rte_eal_hotplug_remove(sdev->bus->name,
>  							sdev->dev->name);
>  		if (sdev_ret) {
> diff --git a/drivers/net/failsafe/failsafe_ether.c b/drivers/net/failsafe/failsafe_ether.c
> index ca42376..f2a52c9 100644
> --- a/drivers/net/failsafe/failsafe_ether.c
> +++ b/drivers/net/failsafe/failsafe_ether.c
> @@ -325,7 +325,7 @@
>  	struct sub_device *sdev;
>  	uint8_t i;
>  
> -	FOREACH_SUBDEV_STATE(sdev, i, dev, DEV_ACTIVE)
> +	FOREACH_SUBDEV_STATE_UNSAFE(sdev, i, dev, DEV_ACTIVE)
>  		if (sdev->remove && fs_rxtx_clean(sdev)) {
>  			fs_dev_stats_save(sdev);
>  			fs_dev_remove(sdev);
> diff --git a/drivers/net/failsafe/failsafe_ops.c b/drivers/net/failsafe/failsafe_ops.c
> index a7c2dba..3d2cb32 100644
> --- a/drivers/net/failsafe/failsafe_ops.c
> +++ b/drivers/net/failsafe/failsafe_ops.c
> @@ -181,6 +181,9 @@
>  	FOREACH_SUBDEV(sdev, i, dev) {
>  		if (sdev->state != DEV_ACTIVE)
>  			continue;
> +		if (sdev->remove == 1 && PRIV(dev)->state < DEV_STARTED)
> +			/* Application shouldn't start removed sub-devices. */
> +			continue;

FOREACH_SUBDEV should already have avoided sub-devices which remove flag
is 1.

If not, then the fs_err(sdev, ret) stanza right after will let the loop
continue, and the port should be handled by the next slave cleanup.

>  		DEBUG("Starting sub_device %d", i);
>  		ret = rte_eth_dev_start(PORT_ID(sdev));
>  		if (ret) {
> @@ -265,10 +268,17 @@
>  	uint8_t i;
>  
>  	failsafe_hotplug_alarm_cancel(dev);
> -	if (PRIV(dev)->state == DEV_STARTED)
> +	if (PRIV(dev)->state == DEV_STARTED) {
> +		/*
> +		 * Clean remove flags to allow stop for all sub-devices because
> +		 * there is not hot-plug alarm to stop the removed sub-devices.
> +		 */
> +		FOREACH_SUBDEV_STATE_UNSAFE(sdev, i, dev, DEV_STARTED)
> +			sdev->remove = 0;

Why make this conditional to state == DEV_STARTED?

>  		dev->dev_ops->dev_stop(dev);
> +	}
>  	PRIV(dev)->state = DEV_ACTIVE - 1;
> -	FOREACH_SUBDEV_STATE(sdev, i, dev, DEV_ACTIVE) {
> +	FOREACH_SUBDEV_STATE_UNSAFE(sdev, i, dev, DEV_ACTIVE) {
>  		DEBUG("Closing sub_device %d", i);
>  		rte_eth_dev_close(PORT_ID(sdev));
>  		sdev->state = DEV_ACTIVE - 1;

-->

	/*
	 * Clean remove flags to allow stop for all sub-devices because
	 * there is no hot-plug alarm to clean the removed sub-devices.
	 */
	FOREACH_SUBDEV_STATE_UNSAFE(sdev, i, dev, DEV_ACTIVE)
		sdev->remove = 0;
	if (PRIV(dev)->state == DEV_STARTED)
		dev->dev_ops->dev_stop(dev);
	PRIV(dev)->state = DEV_ACTIVE - 1;
	FOREACH_SUBDEV_STATE(sdev, i, dev, DEV_ACTIVE) {
		DEBUG("Closing sub_device %d", i);
		rte_eth_dev_close(PORT_ID(sdev));
		sdev->state = DEV_ACTIVE - 1;

> @@ -309,7 +319,7 @@
>  	if (rxq->event_fd > 0)
>  		close(rxq->event_fd);
>  	dev = rxq->priv->dev;
> -	FOREACH_SUBDEV_STATE(sdev, i, dev, DEV_ACTIVE)
> +	FOREACH_SUBDEV_STATE_UNSAFE(sdev, i, dev, DEV_ACTIVE)

No need here, as you would have reset sdev->remove if the port was
closing, or it would be dealt with by fs_dev_remove if the alarm is
still running.

>  		SUBOPS(sdev, rx_queue_release)
>  			(ETH(sdev)->data->rx_queues[rxq->qid]);
>  	dev->data->rx_queues[rxq->qid] = NULL;
> @@ -376,7 +386,7 @@

you really should update your git, it is difficult to verify these
changes without the function contexts.

>  		return ret;
>  	rxq->event_fd = intr_handle.efds[0];
>  	dev->data->rx_queues[rx_queue_id] = rxq;
> -	FOREACH_SUBDEV_STATE(sdev, i, dev, DEV_ACTIVE) {
> +	FOREACH_SUBDEV_STATE_UNSAFE(sdev, i, dev, DEV_ACTIVE) {

Why should we setup queues on ports marked for removal?

>  		ret = rte_eth_rx_queue_setup(PORT_ID(sdev),
>  				rx_queue_id,
>  				nb_rx_desc, socket_id,
> @@ -493,7 +503,7 @@
>  		return;
>  	txq = queue;
>  	dev = txq->priv->dev;
> -	FOREACH_SUBDEV_STATE(sdev, i, dev, DEV_ACTIVE)
> +	FOREACH_SUBDEV_STATE_UNSAFE(sdev, i, dev, DEV_ACTIVE)

Same as for rx_queue_release: either the device is closing, and the flag
has been reset, or the alarm is still running and will take care of
this.

>  		SUBOPS(sdev, tx_queue_release)
>  			(ETH(sdev)->data->tx_queues[txq->qid]);

Actually, now that I think about it, there seems to be an issue with
queues not released on plugout?

In fs_dev_remove, we only do the general dev_stop and dev_close on the
sub_device.

shouldn't we call tx_queue_release as well before?

>  	dev->data->tx_queues[txq->qid] = NULL;
> @@ -548,7 +558,7 @@
>  	txq->info.nb_desc = nb_tx_desc;
>  	txq->priv = PRIV(dev);
>  	dev->data->tx_queues[tx_queue_id] = txq;
> -	FOREACH_SUBDEV_STATE(sdev, i, dev, DEV_ACTIVE) {
> +	FOREACH_SUBDEV_STATE_UNSAFE(sdev, i, dev, DEV_ACTIVE) {

Why using the UNSAFE operator for a setup operation? (Same as for
rx_queue_setup.)

>  		ret = rte_eth_tx_queue_setup(PORT_ID(sdev),
>  				tx_queue_id,
>  				nb_tx_desc, socket_id,
> @@ -663,7 +673,7 @@
>  	int ret;
>  
>  	rte_memcpy(stats, &PRIV(dev)->stats_accumulator, sizeof(*stats));
> -	FOREACH_SUBDEV_STATE(sdev, i, dev, DEV_ACTIVE) {
> +	FOREACH_SUBDEV_STATE_UNSAFE(sdev, i, dev, DEV_ACTIVE) {

Why do you want to attempt a stat read on port bound for removal?

>  		struct rte_eth_stats *snapshot = &sdev->stats_snapshot.stats;
>  		uint64_t *timestamp = &sdev->stats_snapshot.timestamp;
>  
> @@ -746,7 +756,7 @@
>  
>  		rx_offload_capa = default_infos.rx_offload_capa;
>  		rxq_offload_capa = default_infos.rx_queue_offload_capa;
> -		FOREACH_SUBDEV_STATE(sdev, i, dev, DEV_PROBED) {
> +		FOREACH_SUBDEV_STATE_UNSAFE(sdev, i, dev, DEV_PROBED) {

same here.

>  			rte_eth_dev_info_get(PORT_ID(sdev),
>  					&PRIV(dev)->infos);
>  			rx_offload_capa &= PRIV(dev)->infos.rx_offload_capa;
> diff --git a/drivers/net/failsafe/failsafe_private.h b/drivers/net/failsafe/failsafe_private.h
> index f3be152..7ddd63a 100644
> --- a/drivers/net/failsafe/failsafe_private.h
> +++ b/drivers/net/failsafe/failsafe_private.h
> @@ -244,16 +244,31 @@ int failsafe_eth_lsc_event_callback(uint16_t port_id,
>  	((sdev)->sid)
>  
>  /**
> - * Stateful iterator construct over fail-safe sub-devices:
> + * Stateful iterator construct over fail-safe sub-devices,
> + * including the removed sub-devices:

"including sub-devices marked for removal" is more correct here, as the
device is not actually removed yet, only scheduled for.

> + * s:     (struct sub_device *), iterator
> + * i:     (uint8_t), increment
> + * dev:   (struct rte_eth_dev *), fail-safe ethdev
> + * state: (enum dev_state), minimum acceptable device state
> + */
> +

Here the same documentation as for other macros: parameters type, quick
description of what it does.

> +#define FOREACH_SUBDEV_STATE_UNSAFE(s, i, dev, state)	\
> +	for (s = fs_find_next((dev), 0, state, 0, &i);	\
> +	     s != NULL;					\
> +	     s = fs_find_next((dev), i + 1, state, 0, &i))
> +
> +/**
> + * Stateful iterator construct over fail-safe sub-devices,
> + * except the removed sub-devices:
>   * s:     (struct sub_device *), iterator
>   * i:     (uint8_t), increment
>   * dev:   (struct rte_eth_dev *), fail-safe ethdev
>   * state: (enum dev_state), minimum acceptable device state
>   */
>  #define FOREACH_SUBDEV_STATE(s, i, dev, state)		\
> -	for (s = fs_find_next((dev), 0, state, &i);	\
> +	for (s = fs_find_next((dev), 0, state, 1, &i);	\
>  	     s != NULL;					\
> -	     s = fs_find_next((dev), i + 1, state, &i))
> +	     s = fs_find_next((dev), i + 1, state, 1, &i))
>  
>  /**
>   * Iterator construct over fail-safe sub-devices:
> @@ -262,7 +277,7 @@ int failsafe_eth_lsc_event_callback(uint16_t port_id,
>   * dev: (struct rte_eth_dev *), fail-safe ethdev
>   */
>  #define FOREACH_SUBDEV(s, i, dev)			\
> -	FOREACH_SUBDEV_STATE(s, i, dev, DEV_UNDEFINED)
> +	FOREACH_SUBDEV_STATE_UNSAFE(s, i, dev, DEV_UNDEFINED)

No actually, the default case should be using the "SAFE" iterator, so no
change needed here.

>  
>  /* dev: (struct rte_eth_dev *) fail-safe device */
>  #define PREFERRED_SUBDEV(dev) \
> @@ -328,6 +343,7 @@ int failsafe_eth_lsc_event_callback(uint16_t port_id,
>  fs_find_next(struct rte_eth_dev *dev,
>  	     uint8_t sid,
>  	     enum dev_state min_state,
> +	     uint8_t check_remove,

skip_remove? Seems more descriptive.

>  	     uint8_t *sid_out)
>  {
>  	struct sub_device *subs;
> @@ -336,8 +352,12 @@ int failsafe_eth_lsc_event_callback(uint16_t port_id,
>  	subs = PRIV(dev)->subs;
>  	tail = PRIV(dev)->subs_tail;
>  	while (sid < tail) {
> -		if (subs[sid].state >= min_state)
> -			break;
> +		if (subs[sid].state >= min_state) {
> +			if (check_remove == 0)
> +				break;
> +			if (PRIV(dev)->subs[sid].remove == 0)
> +				break;
> +		}
>  		sid++;
>  	}
>  	*sid_out = sid;
> @@ -376,7 +396,7 @@ int failsafe_eth_lsc_event_callback(uint16_t port_id,
>  		uint8_t i;
>  
>  		/* Using acceptable device */
> -		FOREACH_SUBDEV_STATE(sdev, i, dev, req_state) {
> +		FOREACH_SUBDEV_STATE_UNSAFE(sdev, i, dev, req_state) {

Why should we switch emitting device to one marked for removal?

>  			if (sdev == banned)
>  				continue;
>  			DEBUG("Switching tx_dev to sub_device %d",
> -- 
> 1.8.3.1
> 

Regards,
  
Matan Azrad Feb. 8, 2018, 7:24 p.m. UTC | #2
Hi Gaetan

> From: Gaëtan Rivet, Thursday, February 8, 2018 8:11 PM
> On Thu, Feb 08, 2018 at 04:34:13PM +0000, Matan Azrad wrote:
> > Following are the failure steps:
> > 1. The physical device is removed due to change in one of PF
> > parameters (e.g. MTU) 2. The interrupt thread flags the device 3.
> > Within 2 seconds Interrupt thread initializes the actual device
> > removal, then every 2 seconds it tries to re-sync (plug in) the
> > device. The trials fail as long as VF parameter mismatches the PF
> parameter.
> > 4. A control thread initiates a control operation on failsafe which
> > initiates this operation on the device.
> > 5. A race condition occurs between the control thread and interrupt
> > thread when accessing the device data structures.
> >
> > This patch mitigates the race condition in step 5.
> >
> > Fixes: a46f8d584eb8 ("net/failsafe: add fail-safe PMD")
> > Cc: stable@dpdk.org
> >
> > Signed-off-by: Matan Azrad <matan@mellanox.com>
> > ---
> >  drivers/net/failsafe/failsafe.c         |  2 +-
> >  drivers/net/failsafe/failsafe_eal.c     |  2 +-
> >  drivers/net/failsafe/failsafe_ether.c   |  2 +-
> >  drivers/net/failsafe/failsafe_ops.c     | 26 +++++++++++++++++--------
> >  drivers/net/failsafe/failsafe_private.h | 34
> > ++++++++++++++++++++++++++-------
> >  5 files changed, 48 insertions(+), 18 deletions(-)
> >
> > diff --git a/drivers/net/failsafe/failsafe.c
> > b/drivers/net/failsafe/failsafe.c index 7b2cdbb..6cdefd0 100644
> > --- a/drivers/net/failsafe/failsafe.c
> > +++ b/drivers/net/failsafe/failsafe.c
> > @@ -187,7 +187,7 @@
> >  		 * If MAC address was provided as a parameter,
> >  		 * apply to all probed slaves.
> >  		 */
> > -		FOREACH_SUBDEV_STATE(sdev, i, dev, DEV_PROBED) {
> > +		FOREACH_SUBDEV_STATE_UNSAFE(sdev, i, dev,
> DEV_PROBED) {
> 
> No need for the UNSAFE here. The ports should have been just initialized,
> and sdev->remove should be 0.

So, the check is not relevant, why to do so? The UNSAFE skip the check.

> If sdev->remove is 1, then it means it has been set already by a plugout
> event, meaning that rte_eth_dev_default_mac_addr_set should not even
> be called on it.
> 
> >  			ret =
> rte_eth_dev_default_mac_addr_set(PORT_ID(sdev),
> >  							       mac);
> >  			if (ret) {
> > diff --git a/drivers/net/failsafe/failsafe_eal.c
> > b/drivers/net/failsafe/failsafe_eal.c
> > index c3d6731..b3b9c32 100644
> > --- a/drivers/net/failsafe/failsafe_eal.c
> > +++ b/drivers/net/failsafe/failsafe_eal.c
> > @@ -126,7 +126,7 @@
> >  	int sdev_ret;
> >  	int ret = 0;
> >
> > -	FOREACH_SUBDEV_STATE(sdev, i, dev, DEV_PROBED) {
> > +	FOREACH_SUBDEV_STATE_UNSAFE(sdev, i, dev, DEV_PROBED) {
> >  		sdev_ret = rte_eal_hotplug_remove(sdev->bus->name,
> >  							sdev->dev->name);
> >  		if (sdev_ret) {
> > diff --git a/drivers/net/failsafe/failsafe_ether.c
> > b/drivers/net/failsafe/failsafe_ether.c
> > index ca42376..f2a52c9 100644
> > --- a/drivers/net/failsafe/failsafe_ether.c
> > +++ b/drivers/net/failsafe/failsafe_ether.c
> > @@ -325,7 +325,7 @@
> >  	struct sub_device *sdev;
> >  	uint8_t i;
> >
> > -	FOREACH_SUBDEV_STATE(sdev, i, dev, DEV_ACTIVE)
> > +	FOREACH_SUBDEV_STATE_UNSAFE(sdev, i, dev, DEV_ACTIVE)
> >  		if (sdev->remove && fs_rxtx_clean(sdev)) {
> >  			fs_dev_stats_save(sdev);
> >  			fs_dev_remove(sdev);
> > diff --git a/drivers/net/failsafe/failsafe_ops.c
> > b/drivers/net/failsafe/failsafe_ops.c
> > index a7c2dba..3d2cb32 100644
> > --- a/drivers/net/failsafe/failsafe_ops.c
> > +++ b/drivers/net/failsafe/failsafe_ops.c
> > @@ -181,6 +181,9 @@
> >  	FOREACH_SUBDEV(sdev, i, dev) {
> >  		if (sdev->state != DEV_ACTIVE)
> >  			continue;
> > +		if (sdev->remove == 1 && PRIV(dev)->state < DEV_STARTED)
> > +			/* Application shouldn't start removed sub-devices.
> */
> > +			continue;
> 
> FOREACH_SUBDEV should already have avoided sub-devices which remove
> flag is 1.

fs_dev_start() is called by the alarm thread too to restart a removed device(marked by remove flag), so it should not condition the remove flag.

> If not, then the fs_err(sdev, ret) stanza right after will let the loop continue,
> and the port should be handled by the next slave cleanup.
> 
> >  		DEBUG("Starting sub_device %d", i);
> >  		ret = rte_eth_dev_start(PORT_ID(sdev));
> >  		if (ret) {
> > @@ -265,10 +268,17 @@
> >  	uint8_t i;
> >
> >  	failsafe_hotplug_alarm_cancel(dev);
> > -	if (PRIV(dev)->state == DEV_STARTED)
> > +	if (PRIV(dev)->state == DEV_STARTED) {
> > +		/*
> > +		 * Clean remove flags to allow stop for all sub-devices
> because
> > +		 * there is not hot-plug alarm to stop the removed sub-
> devices.
> > +		 */
> > +		FOREACH_SUBDEV_STATE_UNSAFE(sdev, i, dev,
> DEV_STARTED)
> > +			sdev->remove = 0;

> Why make this conditional to state == DEV_STARTED?
> >  		dev->dev_ops->dev_stop(dev);
> > +	}
> >  	PRIV(dev)->state = DEV_ACTIVE - 1;
> > -	FOREACH_SUBDEV_STATE(sdev, i, dev, DEV_ACTIVE) {
> > +	FOREACH_SUBDEV_STATE_UNSAFE(sdev, i, dev, DEV_ACTIVE) {
> >  		DEBUG("Closing sub_device %d", i);
> >  		rte_eth_dev_close(PORT_ID(sdev));
> >  		sdev->state = DEV_ACTIVE - 1;
> 
> -->
> 
> 	/*
> 	 * Clean remove flags to allow stop for all sub-devices because
> 	 * there is no hot-plug alarm to clean the removed sub-devices.
> 	 */
> 	FOREACH_SUBDEV_STATE_UNSAFE(sdev, i, dev, DEV_ACTIVE)
> 		sdev->remove = 0;
> 	if (PRIV(dev)->state == DEV_STARTED)
> 		dev->dev_ops->dev_stop(dev);
> 	PRIV(dev)->state = DEV_ACTIVE - 1;
> 	FOREACH_SUBDEV_STATE(sdev, i, dev, DEV_ACTIVE) {
> 		DEBUG("Closing sub_device %d", i);
> 		rte_eth_dev_close(PORT_ID(sdev));
> 		sdev->state = DEV_ACTIVE - 1;
>

Agree.
 
> > @@ -309,7 +319,7 @@
> >  	if (rxq->event_fd > 0)
> >  		close(rxq->event_fd);
> >  	dev = rxq->priv->dev;
> > -	FOREACH_SUBDEV_STATE(sdev, i, dev, DEV_ACTIVE)
> > +	FOREACH_SUBDEV_STATE_UNSAFE(sdev, i, dev, DEV_ACTIVE)
> 
> No need here, as you would have reset sdev->remove if the port was
> closing, or it would be dealt with by fs_dev_remove if the alarm is still
> running.

Agree.

> 
> >  		SUBOPS(sdev, rx_queue_release)
> >  			(ETH(sdev)->data->rx_queues[rxq->qid]);
> >  	dev->data->rx_queues[rxq->qid] = NULL; @@ -376,7 +386,7 @@
> 
> you really should update your git, it is difficult to verify these changes
> without the function contexts.
>
Agree :) sorry.
 
> >  		return ret;
> >  	rxq->event_fd = intr_handle.efds[0];
> >  	dev->data->rx_queues[rx_queue_id] = rxq;
> > -	FOREACH_SUBDEV_STATE(sdev, i, dev, DEV_ACTIVE) {
> > +	FOREACH_SUBDEV_STATE_UNSAFE(sdev, i, dev, DEV_ACTIVE) {
> 
> Why should we setup queues on ports marked for removal?
> 

Need to change it. 

> >  		ret = rte_eth_rx_queue_setup(PORT_ID(sdev),
> >  				rx_queue_id,
> >  				nb_rx_desc, socket_id,
> > @@ -493,7 +503,7 @@
> >  		return;
> >  	txq = queue;
> >  	dev = txq->priv->dev;
> > -	FOREACH_SUBDEV_STATE(sdev, i, dev, DEV_ACTIVE)
> > +	FOREACH_SUBDEV_STATE_UNSAFE(sdev, i, dev, DEV_ACTIVE)
> 
> Same as for rx_queue_release: either the device is closing, and the flag has
> been reset, or the alarm is still running and will take care of this.
> 

Agree.

> >  		SUBOPS(sdev, tx_queue_release)
> >  			(ETH(sdev)->data->tx_queues[txq->qid]);
> 
> Actually, now that I think about it, there seems to be an issue with queues
> not released on plugout?
> 
> In fs_dev_remove, we only do the general dev_stop and dev_close on the
> sub_device.
> 
> shouldn't we call tx_queue_release as well before?

Isn't it done by dev_close()?

> 
> >  	dev->data->tx_queues[txq->qid] = NULL; @@ -548,7 +558,7 @@
> >  	txq->info.nb_desc = nb_tx_desc;
> >  	txq->priv = PRIV(dev);
> >  	dev->data->tx_queues[tx_queue_id] = txq;
> > -	FOREACH_SUBDEV_STATE(sdev, i, dev, DEV_ACTIVE) {
> > +	FOREACH_SUBDEV_STATE_UNSAFE(sdev, i, dev, DEV_ACTIVE) {
> 
> Why using the UNSAFE operator for a setup operation? (Same as for
> rx_queue_setup.)
> 
No need , you right, all the queue operation should be safe too.

> >  		ret = rte_eth_tx_queue_setup(PORT_ID(sdev),
> >  				tx_queue_id,
> >  				nb_tx_desc, socket_id,
> > @@ -663,7 +673,7 @@
> >  	int ret;
> >
> >  	rte_memcpy(stats, &PRIV(dev)->stats_accumulator, sizeof(*stats));
> > -	FOREACH_SUBDEV_STATE(sdev, i, dev, DEV_ACTIVE) {
> > +	FOREACH_SUBDEV_STATE_UNSAFE(sdev, i, dev, DEV_ACTIVE) {
> 
> Why do you want to attempt a stat read on port bound for removal?

SW counters may success, this function deals with removal case.

> >  		struct rte_eth_stats *snapshot = &sdev-
> >stats_snapshot.stats;
> >  		uint64_t *timestamp = &sdev->stats_snapshot.timestamp;
> >
> > @@ -746,7 +756,7 @@
> >
> >  		rx_offload_capa = default_infos.rx_offload_capa;
> >  		rxq_offload_capa = default_infos.rx_queue_offload_capa;
> > -		FOREACH_SUBDEV_STATE(sdev, i, dev, DEV_PROBED) {
> > +		FOREACH_SUBDEV_STATE_UNSAFE(sdev, i, dev,
> DEV_PROBED) {
> 
> same here.

No need the check, so why?

> 
> >  			rte_eth_dev_info_get(PORT_ID(sdev),
> >  					&PRIV(dev)->infos);
> >  			rx_offload_capa &= PRIV(dev)-
> >infos.rx_offload_capa; diff --git
> > a/drivers/net/failsafe/failsafe_private.h
> > b/drivers/net/failsafe/failsafe_private.h
> > index f3be152..7ddd63a 100644
> > --- a/drivers/net/failsafe/failsafe_private.h
> > +++ b/drivers/net/failsafe/failsafe_private.h
> > @@ -244,16 +244,31 @@ int failsafe_eth_lsc_event_callback(uint16_t
> port_id,
> >  	((sdev)->sid)
> >
> >  /**
> > - * Stateful iterator construct over fail-safe sub-devices:
> > + * Stateful iterator construct over fail-safe sub-devices,
> > + * including the removed sub-devices:
> 
> "including sub-devices marked for removal" is more correct here, as the
> device is not actually removed yet, only scheduled for.
> 

Maybe "including unsynchronized sub-devices"? 

> > + * s:     (struct sub_device *), iterator
> > + * i:     (uint8_t), increment
> > + * dev:   (struct rte_eth_dev *), fail-safe ethdev
> > + * state: (enum dev_state), minimum acceptable device state */
> > +
> 
> Here the same documentation as for other macros: parameters type, quick
> description of what it does.
>

Don't understand you here.
 
> > +#define FOREACH_SUBDEV_STATE_UNSAFE(s, i, dev, state)	\
> > +	for (s = fs_find_next((dev), 0, state, 0, &i);	\
> > +	     s != NULL;					\
> > +	     s = fs_find_next((dev), i + 1, state, 0, &i))
> > +
> > +/**
> > + * Stateful iterator construct over fail-safe sub-devices,
> > + * except the removed sub-devices:
> >   * s:     (struct sub_device *), iterator
> >   * i:     (uint8_t), increment
> >   * dev:   (struct rte_eth_dev *), fail-safe ethdev
> >   * state: (enum dev_state), minimum acceptable device state
> >   */
> >  #define FOREACH_SUBDEV_STATE(s, i, dev, state)		\
> > -	for (s = fs_find_next((dev), 0, state, &i);	\
> > +	for (s = fs_find_next((dev), 0, state, 1, &i);	\
> >  	     s != NULL;					\
> > -	     s = fs_find_next((dev), i + 1, state, &i))
> > +	     s = fs_find_next((dev), i + 1, state, 1, &i))
> >
> >  /**
> >   * Iterator construct over fail-safe sub-devices:
> > @@ -262,7 +277,7 @@ int failsafe_eth_lsc_event_callback(uint16_t
> port_id,
> >   * dev: (struct rte_eth_dev *), fail-safe ethdev
> >   */
> >  #define FOREACH_SUBDEV(s, i, dev)			\
> > -	FOREACH_SUBDEV_STATE(s, i, dev, DEV_UNDEFINED)
> > +	FOREACH_SUBDEV_STATE_UNSAFE(s, i, dev, DEV_UNDEFINED)
> 
> No actually, the default case should be using the "SAFE" iterator, so no
> change needed here.

Also here, I think the check is unnecessary, so using UNSAFE skip it.

> >
> >  /* dev: (struct rte_eth_dev *) fail-safe device */  #define
> > PREFERRED_SUBDEV(dev) \ @@ -328,6 +343,7 @@ int
> > failsafe_eth_lsc_event_callback(uint16_t port_id,  fs_find_next(struct
> > rte_eth_dev *dev,
> >  	     uint8_t sid,
> >  	     enum dev_state min_state,
> > +	     uint8_t check_remove,
> 
> skip_remove? Seems more descriptive.
> 
Agree.

> >  	     uint8_t *sid_out)
> >  {
> >  	struct sub_device *subs;
> > @@ -336,8 +352,12 @@ int failsafe_eth_lsc_event_callback(uint16_t
> port_id,
> >  	subs = PRIV(dev)->subs;
> >  	tail = PRIV(dev)->subs_tail;
> >  	while (sid < tail) {
> > -		if (subs[sid].state >= min_state)
> > -			break;
> > +		if (subs[sid].state >= min_state) {
> > +			if (check_remove == 0)
> > +				break;
> > +			if (PRIV(dev)->subs[sid].remove == 0)
> > +				break;
> > +		}
> >  		sid++;
> >  	}
> >  	*sid_out = sid;
> > @@ -376,7 +396,7 @@ int failsafe_eth_lsc_event_callback(uint16_t
> port_id,
> >  		uint8_t i;
> >
> >  		/* Using acceptable device */
> > -		FOREACH_SUBDEV_STATE(sdev, i, dev, req_state) {
> > +		FOREACH_SUBDEV_STATE_UNSAFE(sdev, i, dev, req_state) {
> 
> Why should we switch emitting device to one marked for removal?

Agree, should be changed.

> >  			if (sdev == banned)
> >  				continue;
> >  			DEBUG("Switching tx_dev to sub_device %d",
> > --
> > 1.8.3.1
> >
> 
> Regards,
> --
> Gaëtan Rivet
> 6WIND
  

Patch

diff --git a/drivers/net/failsafe/failsafe.c b/drivers/net/failsafe/failsafe.c
index 7b2cdbb..6cdefd0 100644
--- a/drivers/net/failsafe/failsafe.c
+++ b/drivers/net/failsafe/failsafe.c
@@ -187,7 +187,7 @@ 
 		 * If MAC address was provided as a parameter,
 		 * apply to all probed slaves.
 		 */
-		FOREACH_SUBDEV_STATE(sdev, i, dev, DEV_PROBED) {
+		FOREACH_SUBDEV_STATE_UNSAFE(sdev, i, dev, DEV_PROBED) {
 			ret = rte_eth_dev_default_mac_addr_set(PORT_ID(sdev),
 							       mac);
 			if (ret) {
diff --git a/drivers/net/failsafe/failsafe_eal.c b/drivers/net/failsafe/failsafe_eal.c
index c3d6731..b3b9c32 100644
--- a/drivers/net/failsafe/failsafe_eal.c
+++ b/drivers/net/failsafe/failsafe_eal.c
@@ -126,7 +126,7 @@ 
 	int sdev_ret;
 	int ret = 0;
 
-	FOREACH_SUBDEV_STATE(sdev, i, dev, DEV_PROBED) {
+	FOREACH_SUBDEV_STATE_UNSAFE(sdev, i, dev, DEV_PROBED) {
 		sdev_ret = rte_eal_hotplug_remove(sdev->bus->name,
 							sdev->dev->name);
 		if (sdev_ret) {
diff --git a/drivers/net/failsafe/failsafe_ether.c b/drivers/net/failsafe/failsafe_ether.c
index ca42376..f2a52c9 100644
--- a/drivers/net/failsafe/failsafe_ether.c
+++ b/drivers/net/failsafe/failsafe_ether.c
@@ -325,7 +325,7 @@ 
 	struct sub_device *sdev;
 	uint8_t i;
 
-	FOREACH_SUBDEV_STATE(sdev, i, dev, DEV_ACTIVE)
+	FOREACH_SUBDEV_STATE_UNSAFE(sdev, i, dev, DEV_ACTIVE)
 		if (sdev->remove && fs_rxtx_clean(sdev)) {
 			fs_dev_stats_save(sdev);
 			fs_dev_remove(sdev);
diff --git a/drivers/net/failsafe/failsafe_ops.c b/drivers/net/failsafe/failsafe_ops.c
index a7c2dba..3d2cb32 100644
--- a/drivers/net/failsafe/failsafe_ops.c
+++ b/drivers/net/failsafe/failsafe_ops.c
@@ -181,6 +181,9 @@ 
 	FOREACH_SUBDEV(sdev, i, dev) {
 		if (sdev->state != DEV_ACTIVE)
 			continue;
+		if (sdev->remove == 1 && PRIV(dev)->state < DEV_STARTED)
+			/* Application shouldn't start removed sub-devices. */
+			continue;
 		DEBUG("Starting sub_device %d", i);
 		ret = rte_eth_dev_start(PORT_ID(sdev));
 		if (ret) {
@@ -265,10 +268,17 @@ 
 	uint8_t i;
 
 	failsafe_hotplug_alarm_cancel(dev);
-	if (PRIV(dev)->state == DEV_STARTED)
+	if (PRIV(dev)->state == DEV_STARTED) {
+		/*
+		 * Clean remove flags to allow stop for all sub-devices because
+		 * there is not hot-plug alarm to stop the removed sub-devices.
+		 */
+		FOREACH_SUBDEV_STATE_UNSAFE(sdev, i, dev, DEV_STARTED)
+			sdev->remove = 0;
 		dev->dev_ops->dev_stop(dev);
+	}
 	PRIV(dev)->state = DEV_ACTIVE - 1;
-	FOREACH_SUBDEV_STATE(sdev, i, dev, DEV_ACTIVE) {
+	FOREACH_SUBDEV_STATE_UNSAFE(sdev, i, dev, DEV_ACTIVE) {
 		DEBUG("Closing sub_device %d", i);
 		rte_eth_dev_close(PORT_ID(sdev));
 		sdev->state = DEV_ACTIVE - 1;
@@ -309,7 +319,7 @@ 
 	if (rxq->event_fd > 0)
 		close(rxq->event_fd);
 	dev = rxq->priv->dev;
-	FOREACH_SUBDEV_STATE(sdev, i, dev, DEV_ACTIVE)
+	FOREACH_SUBDEV_STATE_UNSAFE(sdev, i, dev, DEV_ACTIVE)
 		SUBOPS(sdev, rx_queue_release)
 			(ETH(sdev)->data->rx_queues[rxq->qid]);
 	dev->data->rx_queues[rxq->qid] = NULL;
@@ -376,7 +386,7 @@ 
 		return ret;
 	rxq->event_fd = intr_handle.efds[0];
 	dev->data->rx_queues[rx_queue_id] = rxq;
-	FOREACH_SUBDEV_STATE(sdev, i, dev, DEV_ACTIVE) {
+	FOREACH_SUBDEV_STATE_UNSAFE(sdev, i, dev, DEV_ACTIVE) {
 		ret = rte_eth_rx_queue_setup(PORT_ID(sdev),
 				rx_queue_id,
 				nb_rx_desc, socket_id,
@@ -493,7 +503,7 @@ 
 		return;
 	txq = queue;
 	dev = txq->priv->dev;
-	FOREACH_SUBDEV_STATE(sdev, i, dev, DEV_ACTIVE)
+	FOREACH_SUBDEV_STATE_UNSAFE(sdev, i, dev, DEV_ACTIVE)
 		SUBOPS(sdev, tx_queue_release)
 			(ETH(sdev)->data->tx_queues[txq->qid]);
 	dev->data->tx_queues[txq->qid] = NULL;
@@ -548,7 +558,7 @@ 
 	txq->info.nb_desc = nb_tx_desc;
 	txq->priv = PRIV(dev);
 	dev->data->tx_queues[tx_queue_id] = txq;
-	FOREACH_SUBDEV_STATE(sdev, i, dev, DEV_ACTIVE) {
+	FOREACH_SUBDEV_STATE_UNSAFE(sdev, i, dev, DEV_ACTIVE) {
 		ret = rte_eth_tx_queue_setup(PORT_ID(sdev),
 				tx_queue_id,
 				nb_tx_desc, socket_id,
@@ -663,7 +673,7 @@ 
 	int ret;
 
 	rte_memcpy(stats, &PRIV(dev)->stats_accumulator, sizeof(*stats));
-	FOREACH_SUBDEV_STATE(sdev, i, dev, DEV_ACTIVE) {
+	FOREACH_SUBDEV_STATE_UNSAFE(sdev, i, dev, DEV_ACTIVE) {
 		struct rte_eth_stats *snapshot = &sdev->stats_snapshot.stats;
 		uint64_t *timestamp = &sdev->stats_snapshot.timestamp;
 
@@ -746,7 +756,7 @@ 
 
 		rx_offload_capa = default_infos.rx_offload_capa;
 		rxq_offload_capa = default_infos.rx_queue_offload_capa;
-		FOREACH_SUBDEV_STATE(sdev, i, dev, DEV_PROBED) {
+		FOREACH_SUBDEV_STATE_UNSAFE(sdev, i, dev, DEV_PROBED) {
 			rte_eth_dev_info_get(PORT_ID(sdev),
 					&PRIV(dev)->infos);
 			rx_offload_capa &= PRIV(dev)->infos.rx_offload_capa;
diff --git a/drivers/net/failsafe/failsafe_private.h b/drivers/net/failsafe/failsafe_private.h
index f3be152..7ddd63a 100644
--- a/drivers/net/failsafe/failsafe_private.h
+++ b/drivers/net/failsafe/failsafe_private.h
@@ -244,16 +244,31 @@  int failsafe_eth_lsc_event_callback(uint16_t port_id,
 	((sdev)->sid)
 
 /**
- * Stateful iterator construct over fail-safe sub-devices:
+ * Stateful iterator construct over fail-safe sub-devices,
+ * including the removed sub-devices:
+ * s:     (struct sub_device *), iterator
+ * i:     (uint8_t), increment
+ * dev:   (struct rte_eth_dev *), fail-safe ethdev
+ * state: (enum dev_state), minimum acceptable device state
+ */
+
+#define FOREACH_SUBDEV_STATE_UNSAFE(s, i, dev, state)	\
+	for (s = fs_find_next((dev), 0, state, 0, &i);	\
+	     s != NULL;					\
+	     s = fs_find_next((dev), i + 1, state, 0, &i))
+
+/**
+ * Stateful iterator construct over fail-safe sub-devices,
+ * except the removed sub-devices:
  * s:     (struct sub_device *), iterator
  * i:     (uint8_t), increment
  * dev:   (struct rte_eth_dev *), fail-safe ethdev
  * state: (enum dev_state), minimum acceptable device state
  */
 #define FOREACH_SUBDEV_STATE(s, i, dev, state)		\
-	for (s = fs_find_next((dev), 0, state, &i);	\
+	for (s = fs_find_next((dev), 0, state, 1, &i);	\
 	     s != NULL;					\
-	     s = fs_find_next((dev), i + 1, state, &i))
+	     s = fs_find_next((dev), i + 1, state, 1, &i))
 
 /**
  * Iterator construct over fail-safe sub-devices:
@@ -262,7 +277,7 @@  int failsafe_eth_lsc_event_callback(uint16_t port_id,
  * dev: (struct rte_eth_dev *), fail-safe ethdev
  */
 #define FOREACH_SUBDEV(s, i, dev)			\
-	FOREACH_SUBDEV_STATE(s, i, dev, DEV_UNDEFINED)
+	FOREACH_SUBDEV_STATE_UNSAFE(s, i, dev, DEV_UNDEFINED)
 
 /* dev: (struct rte_eth_dev *) fail-safe device */
 #define PREFERRED_SUBDEV(dev) \
@@ -328,6 +343,7 @@  int failsafe_eth_lsc_event_callback(uint16_t port_id,
 fs_find_next(struct rte_eth_dev *dev,
 	     uint8_t sid,
 	     enum dev_state min_state,
+	     uint8_t check_remove,
 	     uint8_t *sid_out)
 {
 	struct sub_device *subs;
@@ -336,8 +352,12 @@  int failsafe_eth_lsc_event_callback(uint16_t port_id,
 	subs = PRIV(dev)->subs;
 	tail = PRIV(dev)->subs_tail;
 	while (sid < tail) {
-		if (subs[sid].state >= min_state)
-			break;
+		if (subs[sid].state >= min_state) {
+			if (check_remove == 0)
+				break;
+			if (PRIV(dev)->subs[sid].remove == 0)
+				break;
+		}
 		sid++;
 	}
 	*sid_out = sid;
@@ -376,7 +396,7 @@  int failsafe_eth_lsc_event_callback(uint16_t port_id,
 		uint8_t i;
 
 		/* Using acceptable device */
-		FOREACH_SUBDEV_STATE(sdev, i, dev, req_state) {
+		FOREACH_SUBDEV_STATE_UNSAFE(sdev, i, dev, req_state) {
 			if (sdev == banned)
 				continue;
 			DEBUG("Switching tx_dev to sub_device %d",