[2/2] net/mlx5: fix GENEVE parser cleanup

Message ID 20250313083351.25559-2-getelson@nvidia.com (mailing list archive)
State Accepted, archived
Delegated to: Raslan Darawsheh
Headers
Series [1/2] common/mlx5: add device duplication function |

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/loongarch-compilation success Compilation OK
ci/loongarch-unit-testing success Unit Testing PASS
ci/Intel-compilation success Compilation OK
ci/intel-Testing success Testing PASS
ci/intel-Functional success Functional PASS
ci/github-robot: build success github build: passed
ci/iol-marvell-Functional success Functional Testing PASS
ci/iol-mellanox-Performance success Performance Testing PASS
ci/iol-intel-Performance success Performance Testing PASS
ci/iol-broadcom-Performance success Performance Testing PASS
ci/iol-intel-Functional success Functional Testing PASS
ci/iol-abi-testing success Testing PASS
ci/iol-unit-arm64-testing success Testing PASS
ci/iol-unit-amd64-testing success Testing PASS
ci/iol-compile-arm64-testing success Testing PASS
ci/iol-compile-amd64-testing success Testing PASS
ci/iol-sample-apps-testing success Testing PASS

Commit Message

Gregory Etelson March 13, 2025, 8:33 a.m. UTC
From: Michael Baum <michaelba@nvidia.com>

The GENEVE parser is shared across ports on the same physical device. It
is created once for the first port, and a reference counter is
incremented for each additional port. The parser is created using the
InfiniBand Verbs (IBV) context (CTX) of the first port, and cleanup
should use the same context.

Previously, if the port owning the context closed while another port
still used the parser, the port closure would fail due to the shared
parser dependency.

This patch addresses the issue by changing the approach: the physical
device now creates its own distinct context by importing the context
from the first initialized port. This ensures that parser creation and
cleanup use a consistent context, independent of individual port
lifecycles.

Fixes: f5177bdc8b76 ("net/mlx5: add GENEVE TLV options parser API")
Cc: michaelba@nvidia.com

Signed-off-by: Michael Baum <michaelba@nvidia.com>
---
 drivers/net/mlx5/mlx5.c             | 21 ++++++------
 drivers/net/mlx5/mlx5.h             |  2 +-
 drivers/net/mlx5/mlx5_flow.h        |  2 --
 drivers/net/mlx5/mlx5_flow_geneve.c | 50 +++--------------------------
 4 files changed, 18 insertions(+), 57 deletions(-)
  

Comments

Matan Azrad March 17, 2025, 10:48 a.m. UTC | #1
> From: Michael Baum <michaelba@nvidia.com>
> 
> The GENEVE parser is shared across ports on the same physical device. It is
> created once for the first port and increments a reference counter for each
> additional port. The parser is created using the InfiniBand
> (IBV) context (CTX) of the first port, and cleanup should use the same context.
> 
> Previously, if the port owning the context closed while another port still used
> the parser, the port closure would fail due to the shared parser dependency.
> 
> This patch addresses the issue by changing the approach: the physical device
> now creates its own distinct context by importing the context from the first
> initialized port. This ensures that parser creation and cleanup use a consistent
> context, independent of individual port lifecycles.
> 
> Fixes: f5177bdc8b76 ("net/mlx5: add GENEVE TLV options parser API")
> Cc: michaelba@nvidia.com
> 
> Signed-off-by: Michael Baum <michaelba@nvidia.com>
Acked-by: Matan Azrad <matan@nvidia.com>


> ---
>  drivers/net/mlx5/mlx5.c             | 21 ++++++------
>  drivers/net/mlx5/mlx5.h             |  2 +-
>  drivers/net/mlx5/mlx5_flow.h        |  2 --
>  drivers/net/mlx5/mlx5_flow_geneve.c | 50 +++--------------------------
>  4 files changed, 18 insertions(+), 57 deletions(-)
> 
> diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c index
> 0f49cb5e5b..a16568ff22 100644
> --- a/drivers/net/mlx5/mlx5.c
> +++ b/drivers/net/mlx5/mlx5.c
> @@ -1724,6 +1724,16 @@ mlx5_get_physical_device(struct
> mlx5_common_device *cdev)
>  		rte_errno = ENOMEM;
>  		return NULL;
>  	}
> +	/*
> +	 * The same CTX create the physical device objects should destroy
> them.
> +	 * Since we can't be sure it will be done by same CTX, we prepare for
> +	 * the physical device a special CTX used by objects creation.
> +	 */
> +	phdev->ctx = mlx5_os_get_physical_device_ctx(cdev);
> +	if (!phdev->ctx) {
> +		mlx5_free(phdev);
> +		return NULL;
> +	}
>  	phdev->guid = attr->system_image_guid;
>  	phdev->refcnt = 1;
>  	LIST_INSERT_HEAD(&phdev_list, phdev, next); @@ -1767,6 +1777,8
> @@ mlx5_physical_device_destroy(struct mlx5_physical_device *phdev)
>  		return;
>  	/* Remove physical device from the global device list. */
>  	LIST_REMOVE(phdev, next);
> +	MLX5_ASSERT(phdev->ctx);
> +	claim_zero(mlx5_glue->close_device(phdev->ctx));
>  	mlx5_free(phdev);
>  }
> 
> @@ -2323,15 +2335,6 @@ mlx5_dev_close(struct rte_eth_dev *dev)
>  		rte_errno = EBUSY;
>  		return -EBUSY;
>  	}
> -#ifdef HAVE_MLX5_HWS_SUPPORT
> -	/* Check if shared GENEVE options created on context being closed. */
> -	ret = mlx5_geneve_tlv_options_check_busy(priv);
> -	if (ret) {
> -		DRV_LOG(ERR, "port %u maintains shared GENEVE TLV
> options",
> -			dev->data->port_id);
> -		return ret;
> -	}
> -#endif
>  	DRV_LOG(DEBUG, "port %u closing device \"%s\"",
>  		dev->data->port_id, sh->ibdev_name);
>  	/*
> diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h index
> 6df99c25e2..0194887a8b 100644
> --- a/drivers/net/mlx5/mlx5.h
> +++ b/drivers/net/mlx5/mlx5.h
> @@ -1527,7 +1527,7 @@ struct mlx5_common_nic_config {
>   */
>  struct mlx5_physical_device {
>  	LIST_ENTRY(mlx5_physical_device) next;
> -	struct mlx5_dev_ctx_shared *sh; /* Created on sherd context. */
> +	void *ctx; /* CTX for creation of options. */
>  	uint64_t guid; /* System image guid, the uniq ID of physical device. */
>  	struct mlx5_geneve_tlv_options *tlv_options;
>  	struct mlx5_common_nic_config config;
> diff --git a/drivers/net/mlx5/mlx5_flow.h b/drivers/net/mlx5/mlx5_flow.h
> index a5bde158ca..9816ed9238 100644
> --- a/drivers/net/mlx5/mlx5_flow.h
> +++ b/drivers/net/mlx5/mlx5_flow.h
> @@ -3469,8 +3469,6 @@ flow_hw_resource_release(struct rte_eth_dev
> *dev);  int  mlx5_geneve_tlv_options_destroy(struct mlx5_geneve_tlv_options
> *options,
>  				struct mlx5_physical_device *phdev); -int -
> mlx5_geneve_tlv_options_check_busy(struct mlx5_priv *priv);  void
> flow_hw_rxq_flag_set(struct rte_eth_dev *dev, bool enable);  int
> flow_dv_action_validate(struct rte_eth_dev *dev, diff --git
> a/drivers/net/mlx5/mlx5_flow_geneve.c
> b/drivers/net/mlx5/mlx5_flow_geneve.c
> index 6bf53e1270..4d57bb763f 100644
> --- a/drivers/net/mlx5/mlx5_flow_geneve.c
> +++ b/drivers/net/mlx5/mlx5_flow_geneve.c
> @@ -590,8 +590,8 @@ mlx5_geneve_tlv_option_copy(struct
> rte_pmd_mlx5_geneve_tlv *dst,
>  /**
>   * Create list of GENEVE TLV options according to user configuration list.
>   *
> - * @param sh
> - *   Shared context the options are being created on.
> + * @param ctx
> + *   Context returned from mlx5 open_device() glue function.
>   * @param tlv_list
>   *   A list of GENEVE TLV options to create parser for them.
>   * @param nb_options
> @@ -604,7 +604,7 @@ mlx5_geneve_tlv_option_copy(struct
> rte_pmd_mlx5_geneve_tlv *dst,
>   *   NULL otherwise and rte_errno is set.
>   */
>  static struct mlx5_geneve_tlv_options * -
> mlx5_geneve_tlv_options_create(struct mlx5_dev_ctx_shared *sh,
> +mlx5_geneve_tlv_options_create(void *ctx,
>  			       const struct rte_pmd_mlx5_geneve_tlv tlv_list[],
>  			       uint8_t nb_options, uint8_t sample_id)  { @@ -
> 625,7 +625,7 @@ mlx5_geneve_tlv_options_create(struct
> mlx5_dev_ctx_shared *sh,
>  	}
>  	for (i = 0; i < nb_options; ++i) {
>  		spec = &tlv_list[i];
> -		ret = mlx5_geneve_tlv_option_create(sh->cdev->ctx, spec,
> +		ret = mlx5_geneve_tlv_option_create(ctx, spec,
>  						    &options->options[i],
> sample_id);
>  		if (ret < 0)
>  			goto error;
> @@ -633,8 +633,6 @@ mlx5_geneve_tlv_options_create(struct
> mlx5_dev_ctx_shared *sh,
>  		data_mask = options->buffer + i *
> MAX_GENEVE_OPTION_DATA_SIZE;
>  		mlx5_geneve_tlv_option_copy(&options->spec[i], spec,
> data_mask);
>  	}
> -	MLX5_ASSERT(sh->phdev->sh == NULL);
> -	sh->phdev->sh = sh;
>  	options->nb_options = nb_options;
>  	options->refcnt = 1;
>  	return options;
> @@ -676,41 +674,9 @@ mlx5_geneve_tlv_options_destroy(struct
> mlx5_geneve_tlv_options *options,
>  	}
>  	mlx5_free(options);
>  	phdev->tlv_options = NULL;
> -	phdev->sh = NULL;
>  	return 0;
>  }
> 
> -/**
> - * Check if GENEVE TLV options are hosted on the current port
> - * and the port can be closed
> - *
> - * @param priv
> - *   Device private data.
> - *
> - * @return
> - *   0 on success, a negative EBUSY and rte_errno is set.
> - */
> -int
> -mlx5_geneve_tlv_options_check_busy(struct mlx5_priv *priv) -{
> -	struct mlx5_physical_device *phdev =
> mlx5_get_locked_physical_device(priv);
> -	struct mlx5_dev_ctx_shared *sh = priv->sh;
> -
> -	if (!phdev || phdev->sh != sh) {
> -		mlx5_unlock_physical_device();
> -		return 0;
> -	}
> -	if (!sh->phdev->tlv_options || sh->phdev->tlv_options->refcnt == 1) {
> -		/* Mark port as being closed one */
> -		sh->phdev->sh = NULL;
> -		mlx5_unlock_physical_device();
> -		return 0;
> -	}
> -	mlx5_unlock_physical_device();
> -	rte_errno = EBUSY;
> -	return -EBUSY;
> -}
> -
>  /**
>   * Validate GENEVE TLV option user request structure.
>   *
> @@ -955,18 +921,12 @@ mlx5_geneve_tlv_parser_create(uint16_t port_id,
>  			rte_errno = EEXIST;
>  			return NULL;
>  		}
> -		if (phdev->sh == NULL) {
> -			mlx5_unlock_physical_device();
> -			DRV_LOG(ERR, "GENEVE TLV options are hosted on
> port being closed.");
> -			rte_errno = EBUSY;
> -			return NULL;
> -		}
>  		/* Use existing options. */
>  		options->refcnt++;
>  		goto exit;
>  	}
>  	/* Create GENEVE TLV options for this physical device. */
> -	options = mlx5_geneve_tlv_options_create(priv->sh, tlv_list,
> nb_options, sample_id);
> +	options = mlx5_geneve_tlv_options_create(phdev->ctx, tlv_list,
> +nb_options, sample_id);
>  	if (!options) {
>  		mlx5_unlock_physical_device();
>  		return NULL;
> --
> 2.45.2
  

Patch

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 0f49cb5e5b..a16568ff22 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -1724,6 +1724,16 @@  mlx5_get_physical_device(struct mlx5_common_device *cdev)
 		rte_errno = ENOMEM;
 		return NULL;
 	}
+	/*
+	 * The same CTX that creates the physical device objects should destroy them.
+	 * Since we can't be sure it will be done by same CTX, we prepare for
+	 * the physical device a special CTX used by objects creation.
+	 */
+	phdev->ctx = mlx5_os_get_physical_device_ctx(cdev);
+	if (!phdev->ctx) {
+		mlx5_free(phdev);
+		return NULL;
+	}
 	phdev->guid = attr->system_image_guid;
 	phdev->refcnt = 1;
 	LIST_INSERT_HEAD(&phdev_list, phdev, next);
@@ -1767,6 +1777,8 @@  mlx5_physical_device_destroy(struct mlx5_physical_device *phdev)
 		return;
 	/* Remove physical device from the global device list. */
 	LIST_REMOVE(phdev, next);
+	MLX5_ASSERT(phdev->ctx);
+	claim_zero(mlx5_glue->close_device(phdev->ctx));
 	mlx5_free(phdev);
 }
 
@@ -2323,15 +2335,6 @@  mlx5_dev_close(struct rte_eth_dev *dev)
 		rte_errno = EBUSY;
 		return -EBUSY;
 	}
-#ifdef HAVE_MLX5_HWS_SUPPORT
-	/* Check if shared GENEVE options created on context being closed. */
-	ret = mlx5_geneve_tlv_options_check_busy(priv);
-	if (ret) {
-		DRV_LOG(ERR, "port %u maintains shared GENEVE TLV options",
-			dev->data->port_id);
-		return ret;
-	}
-#endif
 	DRV_LOG(DEBUG, "port %u closing device \"%s\"",
 		dev->data->port_id, sh->ibdev_name);
 	/*
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 6df99c25e2..0194887a8b 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -1527,7 +1527,7 @@  struct mlx5_common_nic_config {
  */
 struct mlx5_physical_device {
 	LIST_ENTRY(mlx5_physical_device) next;
-	struct mlx5_dev_ctx_shared *sh; /* Created on sherd context. */
+	void *ctx; /* CTX for creation of options. */
 	uint64_t guid; /* System image guid, the uniq ID of physical device. */
 	struct mlx5_geneve_tlv_options *tlv_options;
 	struct mlx5_common_nic_config config;
diff --git a/drivers/net/mlx5/mlx5_flow.h b/drivers/net/mlx5/mlx5_flow.h
index a5bde158ca..9816ed9238 100644
--- a/drivers/net/mlx5/mlx5_flow.h
+++ b/drivers/net/mlx5/mlx5_flow.h
@@ -3469,8 +3469,6 @@  flow_hw_resource_release(struct rte_eth_dev *dev);
 int
 mlx5_geneve_tlv_options_destroy(struct mlx5_geneve_tlv_options *options,
 				struct mlx5_physical_device *phdev);
-int
-mlx5_geneve_tlv_options_check_busy(struct mlx5_priv *priv);
 void
 flow_hw_rxq_flag_set(struct rte_eth_dev *dev, bool enable);
 int flow_dv_action_validate(struct rte_eth_dev *dev,
diff --git a/drivers/net/mlx5/mlx5_flow_geneve.c b/drivers/net/mlx5/mlx5_flow_geneve.c
index 6bf53e1270..4d57bb763f 100644
--- a/drivers/net/mlx5/mlx5_flow_geneve.c
+++ b/drivers/net/mlx5/mlx5_flow_geneve.c
@@ -590,8 +590,8 @@  mlx5_geneve_tlv_option_copy(struct rte_pmd_mlx5_geneve_tlv *dst,
 /**
  * Create list of GENEVE TLV options according to user configuration list.
  *
- * @param sh
- *   Shared context the options are being created on.
+ * @param ctx
+ *   Context returned from mlx5 open_device() glue function.
  * @param tlv_list
  *   A list of GENEVE TLV options to create parser for them.
  * @param nb_options
@@ -604,7 +604,7 @@  mlx5_geneve_tlv_option_copy(struct rte_pmd_mlx5_geneve_tlv *dst,
  *   NULL otherwise and rte_errno is set.
  */
 static struct mlx5_geneve_tlv_options *
-mlx5_geneve_tlv_options_create(struct mlx5_dev_ctx_shared *sh,
+mlx5_geneve_tlv_options_create(void *ctx,
 			       const struct rte_pmd_mlx5_geneve_tlv tlv_list[],
 			       uint8_t nb_options, uint8_t sample_id)
 {
@@ -625,7 +625,7 @@  mlx5_geneve_tlv_options_create(struct mlx5_dev_ctx_shared *sh,
 	}
 	for (i = 0; i < nb_options; ++i) {
 		spec = &tlv_list[i];
-		ret = mlx5_geneve_tlv_option_create(sh->cdev->ctx, spec,
+		ret = mlx5_geneve_tlv_option_create(ctx, spec,
 						    &options->options[i], sample_id);
 		if (ret < 0)
 			goto error;
@@ -633,8 +633,6 @@  mlx5_geneve_tlv_options_create(struct mlx5_dev_ctx_shared *sh,
 		data_mask = options->buffer + i * MAX_GENEVE_OPTION_DATA_SIZE;
 		mlx5_geneve_tlv_option_copy(&options->spec[i], spec, data_mask);
 	}
-	MLX5_ASSERT(sh->phdev->sh == NULL);
-	sh->phdev->sh = sh;
 	options->nb_options = nb_options;
 	options->refcnt = 1;
 	return options;
@@ -676,41 +674,9 @@  mlx5_geneve_tlv_options_destroy(struct mlx5_geneve_tlv_options *options,
 	}
 	mlx5_free(options);
 	phdev->tlv_options = NULL;
-	phdev->sh = NULL;
 	return 0;
 }
 
-/**
- * Check if GENEVE TLV options are hosted on the current port
- * and the port can be closed
- *
- * @param priv
- *   Device private data.
- *
- * @return
- *   0 on success, a negative EBUSY and rte_errno is set.
- */
-int
-mlx5_geneve_tlv_options_check_busy(struct mlx5_priv *priv)
-{
-	struct mlx5_physical_device *phdev = mlx5_get_locked_physical_device(priv);
-	struct mlx5_dev_ctx_shared *sh = priv->sh;
-
-	if (!phdev || phdev->sh != sh) {
-		mlx5_unlock_physical_device();
-		return 0;
-	}
-	if (!sh->phdev->tlv_options || sh->phdev->tlv_options->refcnt == 1) {
-		/* Mark port as being closed one */
-		sh->phdev->sh = NULL;
-		mlx5_unlock_physical_device();
-		return 0;
-	}
-	mlx5_unlock_physical_device();
-	rte_errno = EBUSY;
-	return -EBUSY;
-}
-
 /**
  * Validate GENEVE TLV option user request structure.
  *
@@ -955,18 +921,12 @@  mlx5_geneve_tlv_parser_create(uint16_t port_id,
 			rte_errno = EEXIST;
 			return NULL;
 		}
-		if (phdev->sh == NULL) {
-			mlx5_unlock_physical_device();
-			DRV_LOG(ERR, "GENEVE TLV options are hosted on port being closed.");
-			rte_errno = EBUSY;
-			return NULL;
-		}
 		/* Use existing options. */
 		options->refcnt++;
 		goto exit;
 	}
 	/* Create GENEVE TLV options for this physical device. */
-	options = mlx5_geneve_tlv_options_create(priv->sh, tlv_list, nb_options, sample_id);
+	options = mlx5_geneve_tlv_options_create(phdev->ctx, tlv_list, nb_options, sample_id);
 	if (!options) {
 		mlx5_unlock_physical_device();
 		return NULL;