[v2] net/mlx5: fix device removal handler for multiport device

Message ID 1557649949-16370-1-git-send-email-viacheslavo@mellanox.com (mailing list archive)
State Accepted, archived
Delegated to: Shahaf Shuler
Headers
Series [v2] net/mlx5: fix device removal handler for multiport device |

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/Intel-compilation success Compilation OK
ci/mellanox-Performance-Testing success Performance Testing PASS
ci/intel-Performance-Testing success Performance Testing PASS

Commit Message

Slava Ovsiienko May 12, 2019, 8:32 a.m. UTC
  The IBV_EVENT_DEVICE_FATAL event is generated by the driver once for
the entire multiport Infiniband device, not for each existing port.
The port index in the event is zero, which caused the device removal
event to be dropped. We should invoke the removal event processing
routine for each port we have installed a handler for.

Fixes: 028b2a28c3cb ("net/mlx5: update event handler for multiport IB devices")

Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
---
v2: - address comments
    - more detailed debug messages in the event handler
    - removed port specific IBV_EVENT_DEVICE_FATAL handling code

v1: http://patches.dpdk.org/patch/53371/

 drivers/net/mlx5/mlx5_ethdev.c | 77 ++++++++++++++++++++++++++++++++++--------
 1 file changed, 62 insertions(+), 15 deletions(-)
  

Comments

Shahaf Shuler May 12, 2019, 12:15 p.m. UTC | #1
Sunday, May 12, 2019 11:32 AM, Viacheslav Ovsiienko:
> Subject: [dpdk-dev] [PATCH v2] net/mlx5: fix device removal handler for
> multiport device
> 
> IBV_EVENT_DEVICE_FATAL event is generated by the driver once for the
> entire multiport Infiniband device, not for each existing ports.
> The port index is zero and it causes dropping the device removal event. We
> should invoke the removal event processing routine for each port we have
> installed handler for.
> 
> Fixes: 028b2a28c3cb ("net/mlx5: update event handler for multiport IB
> devices")
> 
> Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>

Acked-by: Shahaf Shuler <shahafs@mellanox.com>

Thomas, Ferruh,
This one is a critical fix for mlx5. Without it, failsafe support on Azure will be broken.

Could you consider integrating it?

> ---
> v2: - address comments
>     - more detailed debug messages in the event handler
>     - removed port specific IBV_EVENT_DEVICE_FATAL handling code
> 
> v1:
> https://eur03.safelinks.protection.outlook.com/?url=http%3A%2F%2Fpatch
> es.dpdk.org%2Fpatch%2F53371%2F&amp;data=02%7C01%7Cshahafs%40mel
> lanox.com%7C46fcede947654c45106e08d6d6b462e5%7Ca652971c7d2e4d9ba
> 6a4d149256f461b%7C0%7C0%7C636932467570850420&amp;sdata=%2FN%2B
> D0OWf5y0hgtlvWj7om9qZrQPPIbmGXDIfsgqeUtY%3D&amp;reserved=0
> 
>  drivers/net/mlx5/mlx5_ethdev.c | 77
> ++++++++++++++++++++++++++++++++++--------
>  1 file changed, 62 insertions(+), 15 deletions(-)
> 
> diff --git a/drivers/net/mlx5/mlx5_ethdev.c
> b/drivers/net/mlx5/mlx5_ethdev.c index 80ee98f..a8a7ece 100644
> --- a/drivers/net/mlx5/mlx5_ethdev.c
> +++ b/drivers/net/mlx5/mlx5_ethdev.c
> @@ -1116,6 +1116,35 @@ int mlx5_fw_version_get(struct rte_eth_dev
> *dev, char *fw_ver, size_t fw_size)  }
> 
>  /**
> + * Handle asynchronous removal event for entire multiport device.
> + *
> + * @param sh
> + *   Infiniband device shared context.
> + */
> +static void
> +mlx5_dev_interrupt_device_fatal(struct mlx5_ibv_shared *sh) {
> +	uint32_t i;
> +
> +	for (i = 0; i < sh->max_port; ++i) {
> +		struct rte_eth_dev *dev;
> +
> +		if (sh->port[i].ih_port_id >= RTE_MAX_ETHPORTS) {
> +			/*
> +			 * Or not existing port either no
> +			 * handler installed for this port.
> +			 */
> +			continue;
> +		}
> +		dev = &rte_eth_devices[sh->port[i].ih_port_id];
> +		assert(dev);
> +		if (dev->data->dev_conf.intr_conf.rmv)
> +			_rte_eth_dev_callback_process
> +				(dev, RTE_ETH_EVENT_INTR_RMV, NULL);
> +	}
> +}
> +
> +/**
>   * Handle shared asynchronous events the NIC (removal event
>   * and link status change). Supports multiport IB device.
>   *
> @@ -1137,21 +1166,46 @@ int mlx5_fw_version_get(struct rte_eth_dev
> *dev, char *fw_ver, size_t fw_size)
>  			break;
>  		/* Retrieve and check IB port index. */
>  		tmp = (uint32_t)event.element.port_num;
> -		assert(tmp && (tmp <= sh->max_port));
> -		if (!tmp ||
> -		    tmp > sh->max_port ||
> -		    sh->port[tmp - 1].ih_port_id >= RTE_MAX_ETHPORTS) {
> +		if (!tmp && event.event_type ==
> IBV_EVENT_DEVICE_FATAL) {
>  			/*
> -			 * Invalid IB port index or no handler
> -			 * installed for this port.
> +			 * The DEVICE_FATAL event is called once for
> +			 * entire device without port specifying.
> +			 * We should notify all existing ports.
>  			 */
>  			mlx5_glue->ack_async_event(&event);
> +			mlx5_dev_interrupt_device_fatal(sh);
> +			continue;
> +		}
> +		assert(tmp && (tmp <= sh->max_port));
> +		if (!tmp) {
> +			/* Unsupported devive level event. */
> +			mlx5_glue->ack_async_event(&event);
> +			DRV_LOG(DEBUG,
> +				"unsupported common event (type %d)",
> +				event.event_type);
> +			continue;
> +		}
> +		if (tmp > sh->max_port) {
> +			/* Invalid IB port index. */
> +			mlx5_glue->ack_async_event(&event);
> +			DRV_LOG(DEBUG,
> +				"cannot handle an event (type %d)"
> +				"due to invalid IB port index (%u)",
> +				event.event_type, tmp);
> +			continue;
> +		}
> +		if (sh->port[tmp - 1].ih_port_id >= RTE_MAX_ETHPORTS) {
> +			/* No handler installed. */
> +			mlx5_glue->ack_async_event(&event);
> +			DRV_LOG(DEBUG,
> +				"cannot handle an event (type %d)"
> +				"due to no handler installed for port %u",
> +				event.event_type, tmp);
>  			continue;
>  		}
>  		/* Retrieve ethernet device descriptor. */
>  		tmp = sh->port[tmp - 1].ih_port_id;
>  		dev = &rte_eth_devices[tmp];
> -		tmp = 0;
>  		assert(dev);
>  		if ((event.event_type == IBV_EVENT_PORT_ACTIVE ||
>  		     event.event_type == IBV_EVENT_PORT_ERR) && @@ -
> 1165,15 +1219,8 @@ int mlx5_fw_version_get(struct rte_eth_dev *dev, char
> *fw_ver, size_t fw_size)
>  				(dev, RTE_ETH_EVENT_INTR_LSC, NULL);
>  			continue;
>  		}
> -		if (event.event_type == IBV_EVENT_DEVICE_FATAL &&
> -		    dev->data->dev_conf.intr_conf.rmv) {
> -			mlx5_glue->ack_async_event(&event);
> -			_rte_eth_dev_callback_process
> -				(dev, RTE_ETH_EVENT_INTR_RMV, NULL);
> -			continue;
> -		}
>  		DRV_LOG(DEBUG,
> -			"port %u event type %d on not handled",
> +			"port %u cannot handle an unknown event (type
> %d)",
>  			dev->data->port_id, event.event_type);
>  		mlx5_glue->ack_async_event(&event);
>  	}
> --
> 1.8.3.1
  
Shahaf Shuler May 12, 2019, 5:17 p.m. UTC | #2
Sunday, May 12, 2019 3:15 PM, Shahaf Shuler:
> Subject: Re: [dpdk-dev] [PATCH v2] net/mlx5: fix device removal handler for
> multiport device
> 
> Sunday, May 12, 2019 11:32 AM, Viacheslav Ovsiienko:
> > Subject: [dpdk-dev] [PATCH v2] net/mlx5: fix device removal handler
> > for multiport device
> >
> > IBV_EVENT_DEVICE_FATAL event is generated by the driver once for the
> > entire multiport Infiniband device, not for each existing ports.
> > The port index is zero and it causes dropping the device removal
> > event. We should invoke the removal event processing routine for each
> > port we have installed handler for.
> >
> > Fixes: 028b2a28c3cb ("net/mlx5: update event handler for multiport IB
> > devices")
> >
> > Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
> 
> Acked-by: Shahaf Shuler <shahafs@mellanox.com>
> 
> Thomas, Ferruh,
> This one is a critical fix for mlx5. w/o it will break the support for failsafe at
> azure.
> 
> Can you consider to integrate it?

Applied it also to next-net-mlx, thanks. 

> 
> > ---
> > v2: - address comments
> >     - more detailed debug messages in the event handler
> >     - removed port specific IBV_EVENT_DEVICE_FATAL handling code
> >
> > v1:
> >
> https://eur03.safelinks.protection.outlook.com/?url=http%3A%2F%2Fpatch
> >
> es.dpdk.org%2Fpatch%2F53371%2F&amp;data=02%7C01%7Cshahafs%40mel
> >
> lanox.com%7C46fcede947654c45106e08d6d6b462e5%7Ca652971c7d2e4d9ba
> >
> 6a4d149256f461b%7C0%7C0%7C636932467570850420&amp;sdata=%2FN%2B
> > D0OWf5y0hgtlvWj7om9qZrQPPIbmGXDIfsgqeUtY%3D&amp;reserved=0
> >
> >  drivers/net/mlx5/mlx5_ethdev.c | 77
> > ++++++++++++++++++++++++++++++++++--------
> >  1 file changed, 62 insertions(+), 15 deletions(-)
> >
> > diff --git a/drivers/net/mlx5/mlx5_ethdev.c
> > b/drivers/net/mlx5/mlx5_ethdev.c index 80ee98f..a8a7ece 100644
> > --- a/drivers/net/mlx5/mlx5_ethdev.c
> > +++ b/drivers/net/mlx5/mlx5_ethdev.c
> > @@ -1116,6 +1116,35 @@ int mlx5_fw_version_get(struct rte_eth_dev
> > *dev, char *fw_ver, size_t fw_size)  }
> >
> >  /**
> > + * Handle asynchronous removal event for entire multiport device.
> > + *
> > + * @param sh
> > + *   Infiniband device shared context.
> > + */
> > +static void
> > +mlx5_dev_interrupt_device_fatal(struct mlx5_ibv_shared *sh) {
> > +	uint32_t i;
> > +
> > +	for (i = 0; i < sh->max_port; ++i) {
> > +		struct rte_eth_dev *dev;
> > +
> > +		if (sh->port[i].ih_port_id >= RTE_MAX_ETHPORTS) {
> > +			/*
> > +			 * Or not existing port either no
> > +			 * handler installed for this port.
> > +			 */
> > +			continue;
> > +		}
> > +		dev = &rte_eth_devices[sh->port[i].ih_port_id];
> > +		assert(dev);
> > +		if (dev->data->dev_conf.intr_conf.rmv)
> > +			_rte_eth_dev_callback_process
> > +				(dev, RTE_ETH_EVENT_INTR_RMV, NULL);
> > +	}
> > +}
> > +
> > +/**
> >   * Handle shared asynchronous events the NIC (removal event
> >   * and link status change). Supports multiport IB device.
> >   *
> > @@ -1137,21 +1166,46 @@ int mlx5_fw_version_get(struct rte_eth_dev
> > *dev, char *fw_ver, size_t fw_size)
> >  			break;
> >  		/* Retrieve and check IB port index. */
> >  		tmp = (uint32_t)event.element.port_num;
> > -		assert(tmp && (tmp <= sh->max_port));
> > -		if (!tmp ||
> > -		    tmp > sh->max_port ||
> > -		    sh->port[tmp - 1].ih_port_id >= RTE_MAX_ETHPORTS) {
> > +		if (!tmp && event.event_type ==
> > IBV_EVENT_DEVICE_FATAL) {
> >  			/*
> > -			 * Invalid IB port index or no handler
> > -			 * installed for this port.
> > +			 * The DEVICE_FATAL event is called once for
> > +			 * entire device without port specifying.
> > +			 * We should notify all existing ports.
> >  			 */
> >  			mlx5_glue->ack_async_event(&event);
> > +			mlx5_dev_interrupt_device_fatal(sh);
> > +			continue;
> > +		}
> > +		assert(tmp && (tmp <= sh->max_port));
> > +		if (!tmp) {
> > +			/* Unsupported devive level event. */
> > +			mlx5_glue->ack_async_event(&event);
> > +			DRV_LOG(DEBUG,
> > +				"unsupported common event (type %d)",
> > +				event.event_type);
> > +			continue;
> > +		}
> > +		if (tmp > sh->max_port) {
> > +			/* Invalid IB port index. */
> > +			mlx5_glue->ack_async_event(&event);
> > +			DRV_LOG(DEBUG,
> > +				"cannot handle an event (type %d)"
> > +				"due to invalid IB port index (%u)",
> > +				event.event_type, tmp);
> > +			continue;
> > +		}
> > +		if (sh->port[tmp - 1].ih_port_id >= RTE_MAX_ETHPORTS) {
> > +			/* No handler installed. */
> > +			mlx5_glue->ack_async_event(&event);
> > +			DRV_LOG(DEBUG,
> > +				"cannot handle an event (type %d)"
> > +				"due to no handler installed for port %u",
> > +				event.event_type, tmp);
> >  			continue;
> >  		}
> >  		/* Retrieve ethernet device descriptor. */
> >  		tmp = sh->port[tmp - 1].ih_port_id;
> >  		dev = &rte_eth_devices[tmp];
> > -		tmp = 0;
> >  		assert(dev);
> >  		if ((event.event_type == IBV_EVENT_PORT_ACTIVE ||
> >  		     event.event_type == IBV_EVENT_PORT_ERR) && @@ -
> > 1165,15 +1219,8 @@ int mlx5_fw_version_get(struct rte_eth_dev *dev,
> > char *fw_ver, size_t fw_size)
> >  				(dev, RTE_ETH_EVENT_INTR_LSC, NULL);
> >  			continue;
> >  		}
> > -		if (event.event_type == IBV_EVENT_DEVICE_FATAL &&
> > -		    dev->data->dev_conf.intr_conf.rmv) {
> > -			mlx5_glue->ack_async_event(&event);
> > -			_rte_eth_dev_callback_process
> > -				(dev, RTE_ETH_EVENT_INTR_RMV, NULL);
> > -			continue;
> > -		}
> >  		DRV_LOG(DEBUG,
> > -			"port %u event type %d on not handled",
> > +			"port %u cannot handle an unknown event (type
> > %d)",
> >  			dev->data->port_id, event.event_type);
> >  		mlx5_glue->ack_async_event(&event);
> >  	}
> > --
> > 1.8.3.1
  
Thomas Monjalon May 13, 2019, 2:30 p.m. UTC | #3
12/05/2019 19:17, Shahaf Shuler:
> Sunday, May 12, 2019 3:15 PM, Shahaf Shuler:
> > Subject: Re: [dpdk-dev] [PATCH v2] net/mlx5: fix device removal handler for
> > multiport device
> > 
> > Sunday, May 12, 2019 11:32 AM, Viacheslav Ovsiienko:
> > > Subject: [dpdk-dev] [PATCH v2] net/mlx5: fix device removal handler
> > > for multiport device
> > >
> > > IBV_EVENT_DEVICE_FATAL event is generated by the driver once for the
> > > entire multiport Infiniband device, not for each existing ports.
> > > The port index is zero and it causes dropping the device removal
> > > event. We should invoke the removal event processing routine for each
> > > port we have installed handler for.
> > >
> > > Fixes: 028b2a28c3cb ("net/mlx5: update event handler for multiport IB
> > > devices")
> > >
> > > Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
> > 
> > Acked-by: Shahaf Shuler <shahafs@mellanox.com>
> > 
> > Thomas, Ferruh,
> > This one is a critical fix for mlx5. w/o it will break the support for failsafe at
> > azure.
> > 
> > Can you consider to integrate it?
> 
> Applied it also to next-net-mlx, thanks. 

Pulled in master
  

Patch

diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c
index 80ee98f..a8a7ece 100644
--- a/drivers/net/mlx5/mlx5_ethdev.c
+++ b/drivers/net/mlx5/mlx5_ethdev.c
@@ -1116,6 +1116,35 @@  int mlx5_fw_version_get(struct rte_eth_dev *dev, char *fw_ver, size_t fw_size)
 }
 
 /**
+ * Handle asynchronous removal event for entire multiport device.
+ *
+ * @param sh
+ *   Infiniband device shared context.
+ */
+static void
+mlx5_dev_interrupt_device_fatal(struct mlx5_ibv_shared *sh)
+{
+	uint32_t i;
+
+	for (i = 0; i < sh->max_port; ++i) {
+		struct rte_eth_dev *dev;
+
+		if (sh->port[i].ih_port_id >= RTE_MAX_ETHPORTS) {
+			/*
+			 * Either the port does not exist or no
+			 * handler is installed for this port.
+			 */
+			continue;
+		}
+		dev = &rte_eth_devices[sh->port[i].ih_port_id];
+		assert(dev);
+		if (dev->data->dev_conf.intr_conf.rmv)
+			_rte_eth_dev_callback_process
+				(dev, RTE_ETH_EVENT_INTR_RMV, NULL);
+	}
+}
+
+/**
  * Handle shared asynchronous events the NIC (removal event
  * and link status change). Supports multiport IB device.
  *
@@ -1137,21 +1166,46 @@  int mlx5_fw_version_get(struct rte_eth_dev *dev, char *fw_ver, size_t fw_size)
 			break;
 		/* Retrieve and check IB port index. */
 		tmp = (uint32_t)event.element.port_num;
-		assert(tmp && (tmp <= sh->max_port));
-		if (!tmp ||
-		    tmp > sh->max_port ||
-		    sh->port[tmp - 1].ih_port_id >= RTE_MAX_ETHPORTS) {
+		if (!tmp && event.event_type == IBV_EVENT_DEVICE_FATAL) {
 			/*
-			 * Invalid IB port index or no handler
-			 * installed for this port.
+			 * The DEVICE_FATAL event is called once for
+			 * entire device without port specifying.
+			 * We should notify all existing ports.
 			 */
 			mlx5_glue->ack_async_event(&event);
+			mlx5_dev_interrupt_device_fatal(sh);
+			continue;
+		}
+		assert(tmp && (tmp <= sh->max_port));
+		if (!tmp) {
+			/* Unsupported device-level event. */
+			mlx5_glue->ack_async_event(&event);
+			DRV_LOG(DEBUG,
+				"unsupported common event (type %d)",
+				event.event_type);
+			continue;
+		}
+		if (tmp > sh->max_port) {
+			/* Invalid IB port index. */
+			mlx5_glue->ack_async_event(&event);
+			DRV_LOG(DEBUG,
+				"cannot handle an event (type %d)"
+				"due to invalid IB port index (%u)",
+				event.event_type, tmp);
+			continue;
+		}
+		if (sh->port[tmp - 1].ih_port_id >= RTE_MAX_ETHPORTS) {
+			/* No handler installed. */
+			mlx5_glue->ack_async_event(&event);
+			DRV_LOG(DEBUG,
+				"cannot handle an event (type %d)"
+				"due to no handler installed for port %u",
+				event.event_type, tmp);
 			continue;
 		}
 		/* Retrieve ethernet device descriptor. */
 		tmp = sh->port[tmp - 1].ih_port_id;
 		dev = &rte_eth_devices[tmp];
-		tmp = 0;
 		assert(dev);
 		if ((event.event_type == IBV_EVENT_PORT_ACTIVE ||
 		     event.event_type == IBV_EVENT_PORT_ERR) &&
@@ -1165,15 +1219,8 @@  int mlx5_fw_version_get(struct rte_eth_dev *dev, char *fw_ver, size_t fw_size)
 				(dev, RTE_ETH_EVENT_INTR_LSC, NULL);
 			continue;
 		}
-		if (event.event_type == IBV_EVENT_DEVICE_FATAL &&
-		    dev->data->dev_conf.intr_conf.rmv) {
-			mlx5_glue->ack_async_event(&event);
-			_rte_eth_dev_callback_process
-				(dev, RTE_ETH_EVENT_INTR_RMV, NULL);
-			continue;
-		}
 		DRV_LOG(DEBUG,
-			"port %u event type %d on not handled",
+			"port %u cannot handle an unknown event (type %d)",
 			dev->data->port_id, event.event_type);
 		mlx5_glue->ack_async_event(&event);
 	}