[1/2] common/mlx5: update log format after devx_general_cmd error

Message ID 20220608115826.11783-1-getelson@nvidia.com (mailing list archive)
State Accepted, archived
Delegated to: Raslan Darawsheh
Headers
Series [1/2] common/mlx5: update log format after devx_general_cmd error |

Checks

Context Check Description
ci/checkpatch success coding style OK

Commit Message

Gregory Etelson June 8, 2022, 11:58 a.m. UTC
  Application can fetch syndrome value after FW operation failure
starting from Mellanox OFED-5.6.
The patch updates log data issued after devx_general_cmd error.

Signed-off-by: Gregory Etelson <getelson@nvidia.com>
Acked-by: Matan Azrad <matan@nvidia.com>
---
 drivers/common/mlx5/mlx5_devx_cmds.c | 103 ++++++++++++---------------
 1 file changed, 44 insertions(+), 59 deletions(-)
  

Comments

Raslan Darawsheh June 14, 2022, 3:02 p.m. UTC | #1
Hi,

> -----Original Message-----
> From: Gregory Etelson <getelson@nvidia.com>
> Sent: Wednesday, June 8, 2022 2:58 PM
> To: dev@dpdk.org; Gregory Etelson <getelson@nvidia.com>
> Cc: Raslan Darawsheh <rasland@nvidia.com>; Matan Azrad
> <matan@nvidia.com>; Slava Ovsiienko <viacheslavo@nvidia.com>
> Subject: [PATCH 1/2] common/mlx5: update log format after
> devx_general_cmd error
> 
> Application can fetch syndrome value after FW operation failure
> starting from Mellanox OFED-5.6.
> The patch updates log data issued after devx_general_cmd error.
> 
> Signed-off-by: Gregory Etelson <getelson@nvidia.com>
> Acked-by: Matan Azrad <matan@nvidia.com>

Series applied to next-net-mlx,

Kindest regards,
Raslan Darawsheh
  
David Marchand Nov. 8, 2022, 12:46 p.m. UTC | #2
On Wed, Jun 8, 2022 at 1:58 PM Gregory Etelson <getelson@nvidia.com> wrote:
>
> Application can fetch syndrome value after FW operation failure
> starting from Mellanox OFED-5.6.
> The patch updates log data issued after devx_general_cmd error.
>
> Signed-off-by: Gregory Etelson <getelson@nvidia.com>
> Acked-by: Matan Azrad <matan@nvidia.com>
> ---
>  drivers/common/mlx5/mlx5_devx_cmds.c | 103 ++++++++++++---------------
>  1 file changed, 44 insertions(+), 59 deletions(-)
>
> diff --git a/drivers/common/mlx5/mlx5_devx_cmds.c b/drivers/common/mlx5/mlx5_devx_cmds.c
> index c6bdbc12bb..bc06aeccc7 100644
> --- a/drivers/common/mlx5/mlx5_devx_cmds.c
> +++ b/drivers/common/mlx5/mlx5_devx_cmds.c
> @@ -13,39 +13,49 @@
>  #include "mlx5_common_log.h"
>  #include "mlx5_malloc.h"
>
> +/* FW writes status value to the OUT buffer at offset 00H */
> +#define MLX5_FW_STATUS(o) MLX5_GET(general_obj_out_cmd_hdr, (o), status)
> +/* FW writes syndrome value to the OUT buffer at offset 04H */
> +#define MLX5_FW_SYNDROME(o) MLX5_GET(general_obj_out_cmd_hdr, (o), syndrome)
> +
> +#define MLX5_DEVX_ERR_RC(x) ((x) > 0 ? -(x) : ((x) < 0 ? (x) : -1))
> +
> +static void
> +mlx5_devx_err_log(void *out, const char *reason,
> +                 const char *param, uint32_t value)
> +{
> +       rte_errno = errno;
> +       if (!param)
> +               DRV_LOG(ERR, "DevX %s failed errno=%d status=%#x syndrome=%#x",
> +                       reason, errno, MLX5_FW_STATUS(out),
> +                       MLX5_FW_SYNDROME(out));
> +       else
> +               DRV_LOG(ERR, "DevX %s %s=%#X failed errno=%d status=%#x syndrome=%#x",
> +                       reason, param, value, errno, MLX5_FW_STATUS(out),
> +                       MLX5_FW_SYNDROME(out));
> +}
> +
>  static void *
>  mlx5_devx_get_hca_cap(void *ctx, uint32_t *in, uint32_t *out,
>                       int *err, uint32_t flags)
>  {
>         const size_t size_in = MLX5_ST_SZ_DW(query_hca_cap_in) * sizeof(int);
>         const size_t size_out = MLX5_ST_SZ_DW(query_hca_cap_out) * sizeof(int);
> -       int status, syndrome, rc;
> +       int rc;
>
> -       if (err)
> -               *err = 0;
>         memset(in, 0, size_in);
>         memset(out, 0, size_out);
>         MLX5_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP);
>         MLX5_SET(query_hca_cap_in, in, op_mod, flags);
>         rc = mlx5_glue->devx_general_cmd(ctx, in, size_in, out, size_out);
> -       if (rc) {
> -               DRV_LOG(ERR,
> -                       "Failed to query devx HCA capabilities func %#02x",
> -                       flags >> 1);
> +       if (rc || MLX5_FW_STATUS(out)) {
> +               mlx5_devx_err_log(out, "HCA capabilities", "func", flags >> 1);
>                 if (err)
> -                       *err = rc > 0 ? -rc : rc;
> -               return NULL;
> -       }
> -       status = MLX5_GET(query_hca_cap_out, out, status);
> -       syndrome = MLX5_GET(query_hca_cap_out, out, syndrome);
> -       if (status) {
> -               DRV_LOG(ERR,
> -                       "Failed to query devx HCA capabilities func %#02x status %x, syndrome = %x",
> -                       flags >> 1, status, syndrome);
> -               if (err)
> -                       *err = -1;
> +                       *err = MLX5_DEVX_ERR_RC(rc);
>                 return NULL;
>         }
> +       if (err)
> +               *err = 0;
>         return MLX5_ADDR_OF(query_hca_cap_out, out, capability);
>  }
>
> @@ -74,7 +84,7 @@ mlx5_devx_cmd_register_read(void *ctx, uint16_t reg_id, uint32_t arg,
>         uint32_t in[MLX5_ST_SZ_DW(access_register_in)]   = {0};
>         uint32_t out[MLX5_ST_SZ_DW(access_register_out) +
>                      MLX5_ACCESS_REGISTER_DATA_DWORD_MAX] = {0};
> -       int status, rc;
> +       int rc;
>
>         MLX5_ASSERT(data && dw_cnt);
>         MLX5_ASSERT(dw_cnt <= MLX5_ACCESS_REGISTER_DATA_DWORD_MAX);
> @@ -91,23 +101,13 @@ mlx5_devx_cmd_register_read(void *ctx, uint16_t reg_id, uint32_t arg,
>         rc = mlx5_glue->devx_general_cmd(ctx, in, sizeof(in), out,
>                                          MLX5_ST_SZ_BYTES(access_register_out) +
>                                          sizeof(uint32_t) * dw_cnt);
> -       if (rc)
> -               goto error;
> -       status = MLX5_GET(access_register_out, out, status);
> -       if (status) {
> -               int syndrome = MLX5_GET(access_register_out, out, syndrome);
> -
> -               DRV_LOG(DEBUG, "Failed to read access NIC register 0x%X, "
> -                              "status %x, syndrome = %x",
> -                              reg_id, status, syndrome);
> -               return -1;
> +       if (rc || MLX5_FW_STATUS(out)) {
> +               mlx5_devx_err_log(out, "read access", "NIC register", reg_id);
> +               return MLX5_DEVX_ERR_RC(rc);

I went back in the mailing list archive to find this change.
This patch makes the pmd spat some scary log messages...

I get this on a system with a CX5 nic (OVS dpdk-latest + 22.11-rc2) :
2022-11-08T12:26:09.218Z|00027|dpdk|ERR|mlx5_common: DevX read access
NIC register=0X9055 failed errno=22 status=0 syndrome=0

Should I be scared?
The system runs fine and nothing else changed in my setup (afaics), so
I'd expect it continues working..

This specific change of level on the "read access" check breaks OVS
system dpdk unit tests, as those tests fail on ERR level logs.
If this is harmless, please send a patch to revert to debug level (and
for other log messages that were updated in this patch if it was
unneeded).


Thanks.
  
Gregory Etelson Nov. 8, 2022, 4:05 p.m. UTC | #3
Hello David,

> > +       if (rc || MLX5_FW_STATUS(out)) {
> > +               mlx5_devx_err_log(out, "read access", "NIC register", reg_id);
> > +               return MLX5_DEVX_ERR_RC(rc);
> 
> I went back in the mailing list archive to find this change.
> This patch makes the pmd spat some scary log messages...
> 
> I get this on a system with a CX5 nic (OVS dpdk-latest + 22.11-rc2) :
> 2022-11-08T12:26:09.218Z|00027|dpdk|ERR|mlx5_common: DevX read
> access
> NIC register=0X9055 failed errno=22 status=0 syndrome=0
> 
> Should I be scared?
> The system runs fine and nothing else changed in my setup (afaics), so
> I'd expect it continues working..

That failure is expected in MLX HBA before ConnectX-6
That it not a functional error in ConnectX-5 and can be ignored.

> 
> This specific change of level on the "read access" check breaks OVS
> system dpdk unit tests, as those tests fail on ERR level logs.
> If this is harmless, please send a patch to revert to debug level (and
> for other log messages that were updated in this patch if it was
> unneeded).
>

I'll send a patch that removes that error message.
 
> 
> Thanks.
> 
> --
> David Marchand
  

Patch

diff --git a/drivers/common/mlx5/mlx5_devx_cmds.c b/drivers/common/mlx5/mlx5_devx_cmds.c
index c6bdbc12bb..bc06aeccc7 100644
--- a/drivers/common/mlx5/mlx5_devx_cmds.c
+++ b/drivers/common/mlx5/mlx5_devx_cmds.c
@@ -13,39 +13,49 @@ 
 #include "mlx5_common_log.h"
 #include "mlx5_malloc.h"
 
+/* FW writes status value to the OUT buffer at offset 00H */
+#define MLX5_FW_STATUS(o) MLX5_GET(general_obj_out_cmd_hdr, (o), status)
+/* FW writes syndrome value to the OUT buffer at offset 04H */
+#define MLX5_FW_SYNDROME(o) MLX5_GET(general_obj_out_cmd_hdr, (o), syndrome)
+
+#define MLX5_DEVX_ERR_RC(x) ((x) > 0 ? -(x) : ((x) < 0 ? (x) : -1))
+
+static void
+mlx5_devx_err_log(void *out, const char *reason,
+		  const char *param, uint32_t value)
+{
+	rte_errno = errno;
+	if (!param)
+		DRV_LOG(ERR, "DevX %s failed errno=%d status=%#x syndrome=%#x",
+			reason, errno, MLX5_FW_STATUS(out),
+			MLX5_FW_SYNDROME(out));
+	else
+		DRV_LOG(ERR, "DevX %s %s=%#X failed errno=%d status=%#x syndrome=%#x",
+			reason, param, value, errno, MLX5_FW_STATUS(out),
+			MLX5_FW_SYNDROME(out));
+}
+
 static void *
 mlx5_devx_get_hca_cap(void *ctx, uint32_t *in, uint32_t *out,
 		      int *err, uint32_t flags)
 {
 	const size_t size_in = MLX5_ST_SZ_DW(query_hca_cap_in) * sizeof(int);
 	const size_t size_out = MLX5_ST_SZ_DW(query_hca_cap_out) * sizeof(int);
-	int status, syndrome, rc;
+	int rc;
 
-	if (err)
-		*err = 0;
 	memset(in, 0, size_in);
 	memset(out, 0, size_out);
 	MLX5_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP);
 	MLX5_SET(query_hca_cap_in, in, op_mod, flags);
 	rc = mlx5_glue->devx_general_cmd(ctx, in, size_in, out, size_out);
-	if (rc) {
-		DRV_LOG(ERR,
-			"Failed to query devx HCA capabilities func %#02x",
-			flags >> 1);
+	if (rc || MLX5_FW_STATUS(out)) {
+		mlx5_devx_err_log(out, "HCA capabilities", "func", flags >> 1);
 		if (err)
-			*err = rc > 0 ? -rc : rc;
-		return NULL;
-	}
-	status = MLX5_GET(query_hca_cap_out, out, status);
-	syndrome = MLX5_GET(query_hca_cap_out, out, syndrome);
-	if (status) {
-		DRV_LOG(ERR,
-			"Failed to query devx HCA capabilities func %#02x status %x, syndrome = %x",
-			flags >> 1, status, syndrome);
-		if (err)
-			*err = -1;
+			*err = MLX5_DEVX_ERR_RC(rc);
 		return NULL;
 	}
+	if (err)
+		*err = 0;
 	return MLX5_ADDR_OF(query_hca_cap_out, out, capability);
 }
 
@@ -74,7 +84,7 @@  mlx5_devx_cmd_register_read(void *ctx, uint16_t reg_id, uint32_t arg,
 	uint32_t in[MLX5_ST_SZ_DW(access_register_in)]   = {0};
 	uint32_t out[MLX5_ST_SZ_DW(access_register_out) +
 		     MLX5_ACCESS_REGISTER_DATA_DWORD_MAX] = {0};
-	int status, rc;
+	int rc;
 
 	MLX5_ASSERT(data && dw_cnt);
 	MLX5_ASSERT(dw_cnt <= MLX5_ACCESS_REGISTER_DATA_DWORD_MAX);
@@ -91,23 +101,13 @@  mlx5_devx_cmd_register_read(void *ctx, uint16_t reg_id, uint32_t arg,
 	rc = mlx5_glue->devx_general_cmd(ctx, in, sizeof(in), out,
 					 MLX5_ST_SZ_BYTES(access_register_out) +
 					 sizeof(uint32_t) * dw_cnt);
-	if (rc)
-		goto error;
-	status = MLX5_GET(access_register_out, out, status);
-	if (status) {
-		int syndrome = MLX5_GET(access_register_out, out, syndrome);
-
-		DRV_LOG(DEBUG, "Failed to read access NIC register 0x%X, "
-			       "status %x, syndrome = %x",
-			       reg_id, status, syndrome);
-		return -1;
+	if (rc || MLX5_FW_STATUS(out)) {
+		mlx5_devx_err_log(out, "read access", "NIC register", reg_id);
+		return MLX5_DEVX_ERR_RC(rc);
 	}
 	memcpy(data, &out[MLX5_ST_SZ_DW(access_register_out)],
 	       dw_cnt * sizeof(uint32_t));
 	return 0;
-error:
-	rc = (rc > 0) ? -rc : rc;
-	return rc;
 }
 
 /**
@@ -134,7 +134,7 @@  mlx5_devx_cmd_register_write(void *ctx, uint16_t reg_id, uint32_t arg,
 	uint32_t in[MLX5_ST_SZ_DW(access_register_in) +
 		    MLX5_ACCESS_REGISTER_DATA_DWORD_MAX] = {0};
 	uint32_t out[MLX5_ST_SZ_DW(access_register_out)] = {0};
-	int status, rc;
+	int rc;
 	void *ptr;
 
 	MLX5_ASSERT(data && dw_cnt);
@@ -152,26 +152,19 @@  mlx5_devx_cmd_register_write(void *ctx, uint16_t reg_id, uint32_t arg,
 	ptr = MLX5_ADDR_OF(access_register_in, in, register_data);
 	memcpy(ptr, data, dw_cnt * sizeof(uint32_t));
 	rc = mlx5_glue->devx_general_cmd(ctx, in, sizeof(in), out, sizeof(out));
-
+	if (rc || MLX5_FW_STATUS(out)) {
+		mlx5_devx_err_log(out, "write access", "NIC register", reg_id);
+		return MLX5_DEVX_ERR_RC(rc);
+	}
 	rc = mlx5_glue->devx_general_cmd(ctx, in,
 					 MLX5_ST_SZ_BYTES(access_register_in) +
 					 dw_cnt * sizeof(uint32_t),
 					 out, sizeof(out));
-	if (rc)
-		goto error;
-	status = MLX5_GET(access_register_out, out, status);
-	if (status) {
-		int syndrome = MLX5_GET(access_register_out, out, syndrome);
-
-		DRV_LOG(DEBUG, "Failed to write access NIC register 0x%X, "
-			       "status %x, syndrome = %x",
-			       reg_id, status, syndrome);
-		return -1;
+	if (rc || MLX5_FW_STATUS(out)) {
+		mlx5_devx_err_log(out, "write access", "NIC register", reg_id);
+		return MLX5_DEVX_ERR_RC(rc);
 	}
 	return 0;
-error:
-	rc = (rc > 0) ? -rc : rc;
-	return rc;
 }
 
 /**
@@ -466,7 +459,7 @@  mlx5_devx_cmd_query_nic_vport_context(void *ctx,
 	uint32_t in[MLX5_ST_SZ_DW(query_nic_vport_context_in)] = {0};
 	uint32_t out[MLX5_ST_SZ_DW(query_nic_vport_context_out)] = {0};
 	void *vctx;
-	int status, syndrome, rc;
+	int rc;
 
 	/* Query NIC vport context to determine inline mode. */
 	MLX5_SET(query_nic_vport_context_in, in, opcode,
@@ -477,23 +470,15 @@  mlx5_devx_cmd_query_nic_vport_context(void *ctx,
 	rc = mlx5_glue->devx_general_cmd(ctx,
 					 in, sizeof(in),
 					 out, sizeof(out));
-	if (rc)
-		goto error;
-	status = MLX5_GET(query_nic_vport_context_out, out, status);
-	syndrome = MLX5_GET(query_nic_vport_context_out, out, syndrome);
-	if (status) {
-		DRV_LOG(DEBUG, "Failed to query NIC vport context, "
-			"status %x, syndrome = %x", status, syndrome);
-		return -1;
+	if (rc || MLX5_FW_STATUS(out)) {
+		mlx5_devx_err_log(out, "query NIC vport context", NULL, 0);
+		return MLX5_DEVX_ERR_RC(rc);
 	}
 	vctx = MLX5_ADDR_OF(query_nic_vport_context_out, out,
 			    nic_vport_context);
 	attr->vport_inline_mode = MLX5_GET(nic_vport_context, vctx,
 					   min_wqe_inline_mode);
 	return 0;
-error:
-	rc = (rc > 0) ? -rc : rc;
-	return rc;
 }
 
 /**