diff mbox series

[v3,08/17] net/mlx5: add HW steering counter action

Message ID	20220930125315.5079-9-suanmingm@nvidia.com (mailing list archive)
State	Superseded, archived
Delegated to:	Raslan Darawsheh
Headers	Received-SPF: Pass (protection.outlook.com: domain of nvidia.com designates 216.228.117.161 as permitted sender) receiver=protection.outlook.com; client-ip=216.228.117.161; helo=mail.nvidia.com; pr=C From: Suanming Mou <suanmingm@nvidia.com> To: Matan Azrad <matan@nvidia.com>, Viacheslav Ovsiienko <viacheslavo@nvidia.com>, Ray Kinsella <mdr@ashroe.eu> CC: <dev@dpdk.org>, <rasland@nvidia.com>, <orika@nvidia.com>, Xiaoyu Min <jackmin@nvidia.com> Subject: [PATCH v3 08/17] net/mlx5: add HW steering counter action Date: Fri, 30 Sep 2022 15:53:06 +0300 Message-ID: <20220930125315.5079-9-suanmingm@nvidia.com> In-Reply-To: <20220930125315.5079-1-suanmingm@nvidia.com> References: <20220923144334.27736-1-suanmingm@nvidia.com> <20220930125315.5079-1-suanmingm@nvidia.com> MIME-Version: 1.0 Content-Type: text/plain Precedence: list Errors-To: dev-bounces@dpdk.org
Series	net/mlx5: HW steering PMD update \| [v3,00/17] net/mlx5: HW steering PMD update [v3,01/17] net/mlx5: fix invalid flow attributes [v3,02/17] net/mlx5: fix IPv6 and TCP RSS hash fields [v3,03/17] net/mlx5: add shared header reformat support [v3,04/17] net/mlx5: add modify field hws support [v3,05/17] net/mlx5: add HW steering port action [v3,06/17] net/mlx5: add extended metadata mode for hardware steering [v3,07/17] net/mlx5: add HW steering meter action [v3,08/17] net/mlx5: add HW steering counter action [v3,09/17] net/mlx5: support DR action template API [v3,10/17] net/mlx5: add HW steering connection tracking support [v3,11/17] net/mlx5: add HW steering VLAN push, pop and VID modify flow actions [v3,12/17] net/mlx5: implement METER MARK indirect action for HWS [v3,13/17] net/mlx5: add HWS AGE action support [v3,14/17] net/mlx5: add async action push and pull support [v3,15/17] net/mlx5: support flow integrity in HWS group 0 [v3,16/17] net/mlx5: support device control for E-Switch default rule [v3,17/17] net/mlx5: support device control of representor matching

Checks

Context	Check	Description
ci/checkpatch	warning	coding style issues

Commit Message

Suanming Mou Sept. 30, 2022, 12:53 p.m. UTC

  From: Xiaoyu Min <jackmin@nvidia.com>

This commit adds HW steering counter action support.
Pool mechanism is the basic data structure for the HW steering counter.

The HW steering's counter pool is based on the rte_ring of zero-copy
variation.

There are two global rte_rings:
1. free_list:
     Store the counters indexes, which are ready for use.
2. wait_reset_list:
     Store the counters indexes, which are just freed from the user and
     need to query the hardware counter to get the reset value before
     this counter can be reused again.

The counter pool also supports cache per HW steering's queues, which are
also based on rte_ring of zero-copy variation.

The cache can be configured in size, preload, threshold, and fetch size,
they are all exposed via device args.

The main operations of the counter pool are as follows:

 - Get one counter from the pool:
   1. The user call _get_* API.
   2. If the cache is enabled, dequeue one counter index from the local
      cache:
      2.A: if the dequeued one from the local cache is still in reset
	status (counter's query_gen_when_free is equal to pool's query
	gen):
	I. Flush all counters in local cache back to global
	   wait_reset_list.
	II. Fetch _fetch_sz_ counters into the cache from the global
	    free list.
	III. Fetch one counter from the cache.
   3. If the cache is empty, fetch _fetch_sz_ counters from the global
      free list into the cache and fetch one counter from the cache.
 - Free one counter into the pool:
   1. The user calls _put_* API.
   2. Put the counter into the local cache.
   3. If the local cache is full:
      3.A: Write back all counters above _threshold_ into the global
           wait_reset_list.
      3.B: Also, write back this counter into the global wait_reset_list.

When the local cache is disabled, _get_/_put_ cache directly from/into
global list.

Signed-off-by: Xiaoyu Min <jackmin@nvidia.com>
---
 drivers/common/mlx5/mlx5_devx_cmds.c |  50 +++
 drivers/common/mlx5/mlx5_devx_cmds.h |  27 ++
 drivers/common/mlx5/mlx5_prm.h       |  62 ++-
 drivers/common/mlx5/version.map      |   1 +
 drivers/net/mlx5/meson.build         |   1 +
 drivers/net/mlx5/mlx5.c              |  14 +
 drivers/net/mlx5/mlx5.h              |  27 ++
 drivers/net/mlx5/mlx5_defs.h         |   2 +
 drivers/net/mlx5/mlx5_flow.c         |  27 +-
 drivers/net/mlx5/mlx5_flow.h         |   5 +
 drivers/net/mlx5/mlx5_flow_aso.c     | 261 ++++++++++++-
 drivers/net/mlx5/mlx5_flow_hw.c      | 340 +++++++++++++++-
 drivers/net/mlx5/mlx5_hws_cnt.c      | 528 +++++++++++++++++++++++++
 drivers/net/mlx5/mlx5_hws_cnt.h      | 558 +++++++++++++++++++++++++++
 14 files changed, 1871 insertions(+), 32 deletions(-)
 create mode 100644 drivers/net/mlx5/mlx5_hws_cnt.c
 create mode 100644 drivers/net/mlx5/mlx5_hws_cnt.h

diff mbox series

Patch

diff --git a/drivers/common/mlx5/mlx5_devx_cmds.c b/drivers/common/mlx5/mlx5_devx_cmds.c
index ac6891145d..eef7a98248 100644
--- a/drivers/common/mlx5/mlx5_devx_cmds.c
+++ b/drivers/common/mlx5/mlx5_devx_cmds.c
@@ -176,6 +176,41 @@  mlx5_devx_cmd_register_write(void *ctx, uint16_t reg_id, uint32_t arg,
 	return 0;
 }
 
+struct mlx5_devx_obj *
+mlx5_devx_cmd_flow_counter_alloc_general(void *ctx,
+		struct mlx5_devx_counter_attr *attr)
+{
+	struct mlx5_devx_obj *dcs = mlx5_malloc(MLX5_MEM_ZERO, sizeof(*dcs),
+						0, SOCKET_ID_ANY);
+	uint32_t in[MLX5_ST_SZ_DW(alloc_flow_counter_in)]   = {0};
+	uint32_t out[MLX5_ST_SZ_DW(alloc_flow_counter_out)] = {0};
+
+	if (!dcs) {
+		rte_errno = ENOMEM;
+		return NULL;
+	}
+	MLX5_SET(alloc_flow_counter_in, in, opcode,
+		 MLX5_CMD_OP_ALLOC_FLOW_COUNTER);
+	if (attr->bulk_log_max_alloc)
+		MLX5_SET(alloc_flow_counter_in, in, flow_counter_bulk_log_size,
+			 attr->flow_counter_bulk_log_size);
+	else
+		MLX5_SET(alloc_flow_counter_in, in, flow_counter_bulk,
+			 attr->bulk_n_128);
+	if (attr->pd_valid)
+		MLX5_SET(alloc_flow_counter_in, in, pd, attr->pd);
+	dcs->obj = mlx5_glue->devx_obj_create(ctx, in,
+					      sizeof(in), out, sizeof(out));
+	if (!dcs->obj) {
+		DRV_LOG(ERR, "Can't allocate counters - error %d", errno);
+		rte_errno = errno;
+		mlx5_free(dcs);
+		return NULL;
+	}
+	dcs->id = MLX5_GET(alloc_flow_counter_out, out, flow_counter_id);
+	return dcs;
+}
+
 /**
  * Allocate flow counters via devx interface.
  *
@@ -967,6 +1002,16 @@  mlx5_devx_cmd_query_hca_attr(void *ctx,
 					 general_obj_types) &
 			      MLX5_GENERAL_OBJ_TYPES_CAP_CONN_TRACK_OFFLOAD);
 	attr->rq_delay_drop = MLX5_GET(cmd_hca_cap, hcattr, rq_delay_drop);
+	attr->max_flow_counter_15_0 = MLX5_GET(cmd_hca_cap, hcattr,
+			max_flow_counter_15_0);
+	attr->max_flow_counter_31_16 = MLX5_GET(cmd_hca_cap, hcattr,
+			max_flow_counter_31_16);
+	attr->alloc_flow_counter_pd = MLX5_GET(cmd_hca_cap, hcattr,
+			alloc_flow_counter_pd);
+	attr->flow_counter_access_aso = MLX5_GET(cmd_hca_cap, hcattr,
+			flow_counter_access_aso);
+	attr->flow_access_aso_opc_mod = MLX5_GET(cmd_hca_cap, hcattr,
+			flow_access_aso_opc_mod);
 	if (attr->crypto) {
 		attr->aes_xts = MLX5_GET(cmd_hca_cap, hcattr, aes_xts);
 		hcattr = mlx5_devx_get_hca_cap(ctx, in, out, &rc,
@@ -989,6 +1034,11 @@  mlx5_devx_cmd_query_hca_attr(void *ctx,
 		}
 		attr->log_min_stride_wqe_sz = MLX5_GET(cmd_hca_cap_2, hcattr,
 						       log_min_stride_wqe_sz);
+		attr->flow_counter_bulk_log_max_alloc = MLX5_GET(cmd_hca_cap_2,
+				hcattr, flow_counter_bulk_log_max_alloc);
+		attr->flow_counter_bulk_log_granularity =
+			MLX5_GET(cmd_hca_cap_2, hcattr,
+				 flow_counter_bulk_log_granularity);
 	}
 	if (attr->log_min_stride_wqe_sz == 0)
 		attr->log_min_stride_wqe_sz = MLX5_MPRQ_LOG_MIN_STRIDE_WQE_SIZE;
diff --git a/drivers/common/mlx5/mlx5_devx_cmds.h b/drivers/common/mlx5/mlx5_devx_cmds.h
index d69dad613e..15b46f2acd 100644
--- a/drivers/common/mlx5/mlx5_devx_cmds.h
+++ b/drivers/common/mlx5/mlx5_devx_cmds.h
@@ -15,6 +15,16 @@ 
 #define MLX5_DEVX_MAX_KLM_ENTRIES ((UINT16_MAX - \
 		MLX5_ST_SZ_DW(create_mkey_in) * 4) / (MLX5_ST_SZ_DW(klm) * 4))
 
+struct mlx5_devx_counter_attr {
+	uint32_t pd_valid:1;
+	uint32_t pd:24;
+	uint32_t bulk_log_max_alloc:1;
+	union {
+		uint8_t flow_counter_bulk_log_size;
+		uint8_t bulk_n_128;
+	};
+};
+
 struct mlx5_devx_mkey_attr {
 	uint64_t addr;
 	uint64_t size;
@@ -263,6 +273,18 @@  struct mlx5_hca_attr {
 	uint32_t set_reg_c:8;
 	uint32_t nic_flow_table:1;
 	uint32_t modify_outer_ip_ecn:1;
+	union {
+		uint32_t max_flow_counter;
+		struct {
+			uint16_t max_flow_counter_15_0;
+			uint16_t max_flow_counter_31_16;
+		};
+	};
+	uint32_t flow_counter_bulk_log_max_alloc:5;
+	uint32_t flow_counter_bulk_log_granularity:5;
+	uint32_t alloc_flow_counter_pd:1;
+	uint32_t flow_counter_access_aso:1;
+	uint32_t flow_access_aso_opc_mod:8;
 };
 
 /* LAG Context. */
@@ -593,6 +615,11 @@  struct mlx5_devx_crypto_login_attr {
 
 /* mlx5_devx_cmds.c */
 
+__rte_internal
+struct mlx5_devx_obj *
+mlx5_devx_cmd_flow_counter_alloc_general(void *ctx,
+				struct mlx5_devx_counter_attr *attr);
+
 __rte_internal
 struct mlx5_devx_obj *mlx5_devx_cmd_flow_counter_alloc(void *ctx,
 						       uint32_t bulk_sz);
diff --git a/drivers/common/mlx5/mlx5_prm.h b/drivers/common/mlx5/mlx5_prm.h
index c82ec94465..8514ca8fc4 100644
--- a/drivers/common/mlx5/mlx5_prm.h
+++ b/drivers/common/mlx5/mlx5_prm.h
@@ -1161,8 +1161,10 @@  struct mlx5_ifc_alloc_flow_counter_in_bits {
 	u8 reserved_at_10[0x10];
 	u8 reserved_at_20[0x10];
 	u8 op_mod[0x10];
-	u8 flow_counter_id[0x20];
-	u8 reserved_at_40[0x18];
+	u8 reserved_at_40[0x8];
+	u8 pd[0x18];
+	u8 reserved_at_60[0x13];
+	u8 flow_counter_bulk_log_size[0x5];
 	u8 flow_counter_bulk[0x8];
 };
 
@@ -1382,7 +1384,13 @@  enum {
 #define MLX5_STEERING_LOGIC_FORMAT_CONNECTX_6DX 0x1
 
 struct mlx5_ifc_cmd_hca_cap_bits {
-	u8 reserved_at_0[0x20];
+	u8 access_other_hca_roce[0x1];
+	u8 alloc_flow_counter_pd[0x1];
+	u8 flow_counter_access_aso[0x1];
+	u8 reserved_at_3[0x5];
+	u8 flow_access_aso_opc_mod[0x8];
+	u8 reserved_at_10[0xf];
+	u8 vhca_resource_manager[0x1];
 	u8 hca_cap_2[0x1];
 	u8 reserved_at_21[0xf];
 	u8 vhca_id[0x10];
@@ -2058,8 +2066,52 @@  struct mlx5_ifc_cmd_hca_cap_2_bits {
 	u8 log_conn_track_max_alloc[0x5];
 	u8 reserved_at_d8[0x3];
 	u8 log_max_conn_track_offload[0x5];
-	u8 reserved_at_e0[0x20]; /* End of DW7. */
-	u8 reserved_at_100[0x700];
+	u8 reserved_at_e0[0xc0];
+	u8 reserved_at_1a0[0xb];
+	u8 format_select_dw_8_6_ext[0x1];
+	u8 reserved_at_1ac[0x14];
+	u8 general_obj_types_127_64[0x40];
+	u8 reserved_at_200[0x53];
+	u8 flow_counter_bulk_log_max_alloc[0x5];
+	u8 reserved_at_258[0x3];
+	u8 flow_counter_bulk_log_granularity[0x5];
+	u8 reserved_at_260[0x20];
+	u8 format_select_dw_gtpu_dw_0[0x8];
+	u8 format_select_dw_gtpu_dw_1[0x8];
+	u8 format_select_dw_gtpu_dw_2[0x8];
+	u8 format_select_dw_gtpu_first_ext_dw_0[0x8];
+	u8 reserved_at_2a0[0x560];
+};
+
+struct mlx5_ifc_wqe_based_flow_table_cap_bits {
+	u8 reserved_at_0[0x3];
+	u8 log_max_num_ste[0x5];
+	u8 reserved_at_8[0x3];
+	u8 log_max_num_stc[0x5];
+	u8 reserved_at_10[0x3];
+	u8 log_max_num_rtc[0x5];
+	u8 reserved_at_18[0x3];
+	u8 log_max_num_header_modify_pattern[0x5];
+	u8 reserved_at_20[0x3];
+	u8 stc_alloc_log_granularity[0x5];
+	u8 reserved_at_28[0x3];
+	u8 stc_alloc_log_max[0x5];
+	u8 reserved_at_30[0x3];
+	u8 ste_alloc_log_granularity[0x5];
+	u8 reserved_at_38[0x3];
+	u8 ste_alloc_log_max[0x5];
+	u8 reserved_at_40[0xb];
+	u8 rtc_reparse_mode[0x5];
+	u8 reserved_at_50[0x3];
+	u8 rtc_index_mode[0x5];
+	u8 reserved_at_58[0x3];
+	u8 rtc_log_depth_max[0x5];
+	u8 reserved_at_60[0x10];
+	u8 ste_format[0x10];
+	u8 stc_action_type[0x80];
+	u8 header_insert_type[0x10];
+	u8 header_remove_type[0x10];
+	u8 trivial_match_definer[0x20];
 };
 
 struct mlx5_ifc_esw_cap_bits {
diff --git a/drivers/common/mlx5/version.map b/drivers/common/mlx5/version.map
index 413dec14ab..4f72900519 100644
--- a/drivers/common/mlx5/version.map
+++ b/drivers/common/mlx5/version.map
@@ -40,6 +40,7 @@  INTERNAL {
 	mlx5_devx_cmd_create_virtq;
 	mlx5_devx_cmd_destroy;
 	mlx5_devx_cmd_flow_counter_alloc;
+	mlx5_devx_cmd_flow_counter_alloc_general;
 	mlx5_devx_cmd_flow_counter_query;
 	mlx5_devx_cmd_flow_dump;
 	mlx5_devx_cmd_flow_single_dump;
diff --git a/drivers/net/mlx5/meson.build b/drivers/net/mlx5/meson.build
index 6a84d96380..f2d7bcaff6 100644
--- a/drivers/net/mlx5/meson.build
+++ b/drivers/net/mlx5/meson.build
@@ -38,6 +38,7 @@  sources = files(
         'mlx5_vlan.c',
         'mlx5_utils.c',
         'mlx5_devx.c',
+	'mlx5_hws_cnt.c',
 )
 
 if is_linux
diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index cf5146d677..b6a66f12ee 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -175,6 +175,12 @@ 
 /* Device parameter to create the fdb default rule in PMD */
 #define MLX5_FDB_DEFAULT_RULE_EN "fdb_def_rule_en"
 
+/* HW steering counter configuration. */
+#define MLX5_HWS_CNT_SERVICE_CORE "service_core"
+
+/* HW steering counter's query interval. */
+#define MLX5_HWS_CNT_CYCLE_TIME "svc_cycle_time"
+
 /* Shared memory between primary and secondary processes. */
 struct mlx5_shared_data *mlx5_shared_data;
 
@@ -1245,6 +1251,10 @@  mlx5_dev_args_check_handler(const char *key, const char *val, void *opaque)
 		config->allow_duplicate_pattern = !!tmp;
 	} else if (strcmp(MLX5_FDB_DEFAULT_RULE_EN, key) == 0) {
 		config->fdb_def_rule = !!tmp;
+	} else if (strcmp(MLX5_HWS_CNT_SERVICE_CORE, key) == 0) {
+		config->cnt_svc.service_core = tmp;
+	} else if (strcmp(MLX5_HWS_CNT_CYCLE_TIME, key) == 0) {
+		config->cnt_svc.cycle_time = tmp;
 	}
 	return 0;
 }
@@ -1281,6 +1291,8 @@  mlx5_shared_dev_ctx_args_config(struct mlx5_dev_ctx_shared *sh,
 		MLX5_DECAP_EN,
 		MLX5_ALLOW_DUPLICATE_PATTERN,
 		MLX5_FDB_DEFAULT_RULE_EN,
+		MLX5_HWS_CNT_SERVICE_CORE,
+		MLX5_HWS_CNT_CYCLE_TIME,
 		NULL,
 	};
 	int ret = 0;
@@ -1293,6 +1305,8 @@  mlx5_shared_dev_ctx_args_config(struct mlx5_dev_ctx_shared *sh,
 	config->decap_en = 1;
 	config->allow_duplicate_pattern = 1;
 	config->fdb_def_rule = 1;
+	config->cnt_svc.cycle_time = MLX5_CNT_SVC_CYCLE_TIME_DEFAULT;
+	config->cnt_svc.service_core = rte_get_main_lcore();
 	if (mkvlist != NULL) {
 		/* Process parameters. */
 		ret = mlx5_kvargs_process(mkvlist, params,
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 686969719a..4859f5a509 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -308,6 +308,10 @@  struct mlx5_sh_config {
 	uint32_t hw_fcs_strip:1; /* FCS stripping is supported. */
 	uint32_t allow_duplicate_pattern:1;
 	uint32_t lro_allowed:1; /* Whether LRO is allowed. */
+	struct {
+		uint16_t service_core;
+		uint32_t cycle_time; /* query cycle time in milli-second. */
+	} cnt_svc; /* configure for HW steering's counter's service. */
 	/* Allow/Prevent the duplicate rules pattern. */
 	uint32_t fdb_def_rule:1; /* Create FDB default jump rule */
 };
@@ -1224,6 +1228,22 @@  struct mlx5_flex_item {
 	struct mlx5_flex_pattern_field map[MLX5_FLEX_ITEM_MAPPING_NUM];
 };
 
+#define HWS_CNT_ASO_SQ_NUM 4
+
+struct mlx5_hws_aso_mng {
+	uint16_t sq_num;
+	struct mlx5_aso_sq sqs[HWS_CNT_ASO_SQ_NUM];
+};
+
+struct mlx5_hws_cnt_svc_mng {
+	uint32_t refcnt;
+	uint32_t service_core;
+	uint32_t query_interval;
+	pthread_t service_thread;
+	uint8_t svc_running;
+	struct mlx5_hws_aso_mng aso_mng __rte_cache_aligned;
+};
+
 /*
  * Shared Infiniband device context for Master/Representors
  * which belong to same IB device with multiple IB ports.
@@ -1323,6 +1343,7 @@  struct mlx5_dev_ctx_shared {
 	pthread_mutex_t lwm_config_lock;
 	uint32_t host_shaper_rate:8;
 	uint32_t lwm_triggered:1;
+	struct mlx5_hws_cnt_svc_mng *cnt_svc;
 	struct mlx5_dev_shared_port port[]; /* per device port data array. */
 };
 
@@ -1607,6 +1628,7 @@  struct mlx5_priv {
 	/* Flex items have been created on the port. */
 	uint32_t flex_item_map; /* Map of allocated flex item elements. */
 	uint32_t nb_queue; /* HW steering queue number. */
+	struct mlx5_hws_cnt_pool *hws_cpool; /* HW steering's counter pool. */
 #if defined(HAVE_IBV_FLOW_DV_SUPPORT) || !defined(HAVE_INFINIBAND_VERBS_H)
 	/* Item template list. */
 	LIST_HEAD(flow_hw_itt, rte_flow_pattern_template) flow_hw_itt;
@@ -2037,6 +2059,11 @@  mlx5_get_supported_sw_parsing_offloads(const struct mlx5_hca_attr *attr);
 uint32_t
 mlx5_get_supported_tunneling_offloads(const struct mlx5_hca_attr *attr);
 
+int mlx5_aso_cnt_queue_init(struct mlx5_dev_ctx_shared *sh);
+void mlx5_aso_cnt_queue_uninit(struct mlx5_dev_ctx_shared *sh);
+int mlx5_aso_cnt_query(struct mlx5_dev_ctx_shared *sh,
+		struct mlx5_hws_cnt_pool *cpool);
+
 /* mlx5_flow_flex.c */
 
 struct rte_flow_item_flex_handle *
diff --git a/drivers/net/mlx5/mlx5_defs.h b/drivers/net/mlx5/mlx5_defs.h
index 585afb0a98..d064abfef3 100644
--- a/drivers/net/mlx5/mlx5_defs.h
+++ b/drivers/net/mlx5/mlx5_defs.h
@@ -188,4 +188,6 @@ 
 #define static_assert _Static_assert
 #endif
 
+#define MLX5_CNT_SVC_CYCLE_TIME_DEFAULT 500
+
 #endif /* RTE_PMD_MLX5_DEFS_H_ */
diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
index fb3be940e5..658cc69750 100644
--- a/drivers/net/mlx5/mlx5_flow.c
+++ b/drivers/net/mlx5/mlx5_flow.c
@@ -7832,24 +7832,33 @@  mlx5_flow_isolate(struct rte_eth_dev *dev,
  */
 static int
 flow_drv_query(struct rte_eth_dev *dev,
-	       uint32_t flow_idx,
+	       struct rte_flow *eflow,
 	       const struct rte_flow_action *actions,
 	       void *data,
 	       struct rte_flow_error *error)
 {
 	struct mlx5_priv *priv = dev->data->dev_private;
 	const struct mlx5_flow_driver_ops *fops;
-	struct rte_flow *flow = mlx5_ipool_get(priv->flows[MLX5_FLOW_TYPE_GEN],
-					       flow_idx);
-	enum mlx5_flow_drv_type ftype;
+	struct rte_flow *flow = NULL;
+	enum mlx5_flow_drv_type ftype = MLX5_FLOW_TYPE_MIN;
 
+	if (priv->sh->config.dv_flow_en == 2) {
+#if defined(HAVE_IBV_FLOW_DV_SUPPORT) || !defined(HAVE_INFINIBAND_VERBS_H)
+		flow = eflow;
+		ftype = MLX5_FLOW_TYPE_HW;
+#endif
+	} else {
+		flow = (struct rte_flow *)mlx5_ipool_get(priv->flows[MLX5_FLOW_TYPE_GEN],
+				(uintptr_t)(void *)eflow);
+	}
 	if (!flow) {
 		return rte_flow_error_set(error, ENOENT,
 			  RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
 			  NULL,
 			  "invalid flow handle");
 	}
-	ftype = flow->drv_type;
+	if (ftype == MLX5_FLOW_TYPE_MIN)
+		ftype = flow->drv_type;
 	MLX5_ASSERT(ftype > MLX5_FLOW_TYPE_MIN && ftype < MLX5_FLOW_TYPE_MAX);
 	fops = flow_get_drv_ops(ftype);
 
@@ -7870,14 +7879,8 @@  mlx5_flow_query(struct rte_eth_dev *dev,
 		struct rte_flow_error *error)
 {
 	int ret;
-	struct mlx5_priv *priv = dev->data->dev_private;
 
-	if (priv->sh->config.dv_flow_en == 2)
-		return rte_flow_error_set(error, ENOTSUP,
-			  RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
-			  NULL,
-			  "Flow non-Q query not supported");
-	ret = flow_drv_query(dev, (uintptr_t)(void *)flow, actions, data,
+	ret = flow_drv_query(dev, flow, actions, data,
 			     error);
 	if (ret < 0)
 		return ret;
diff --git a/drivers/net/mlx5/mlx5_flow.h b/drivers/net/mlx5/mlx5_flow.h
index 3bde95c927..8f1b66eaac 100644
--- a/drivers/net/mlx5/mlx5_flow.h
+++ b/drivers/net/mlx5/mlx5_flow.h
@@ -1103,6 +1103,7 @@  struct rte_flow_hw {
 	};
 	struct rte_flow_template_table *table; /* The table flow allcated from. */
 	struct mlx5dr_rule rule; /* HWS layer data struct. */
+	uint32_t cnt_id;
 } __rte_packed;
 
 /* rte flow action translate to DR action struct. */
@@ -1146,6 +1147,9 @@  struct mlx5_action_construct_data {
 			uint32_t level; /* RSS level. */
 			uint32_t idx; /* Shared action index. */
 		} shared_rss;
+		struct {
+			uint32_t id;
+		} shared_counter;
 	};
 };
 
@@ -1224,6 +1228,7 @@  struct mlx5_hw_actions {
 	uint16_t encap_decap_pos; /* Encap/Decap action position. */
 	uint32_t acts_num:4; /* Total action number. */
 	uint32_t mark:1; /* Indicate the mark action. */
+	uint32_t cnt_id; /* Counter id. */
 	/* Translated DR action array from action template. */
 	struct mlx5dr_rule_action rule_acts[MLX5_HW_MAX_ACTS];
 };
diff --git a/drivers/net/mlx5/mlx5_flow_aso.c b/drivers/net/mlx5/mlx5_flow_aso.c
index 60d0280367..ed9272e583 100644
--- a/drivers/net/mlx5/mlx5_flow_aso.c
+++ b/drivers/net/mlx5/mlx5_flow_aso.c
@@ -12,6 +12,9 @@ 
 
 #include "mlx5.h"
 #include "mlx5_flow.h"
+#include "mlx5_hws_cnt.h"
+
+#define MLX5_ASO_CNT_QUEUE_LOG_DESC 14
 
 /**
  * Free MR resources.
@@ -79,6 +82,33 @@  mlx5_aso_destroy_sq(struct mlx5_aso_sq *sq)
 	memset(sq, 0, sizeof(*sq));
 }
 
+/**
+ * Initialize Send Queue used for ASO access counter.
+ *
+ * @param[in] sq
+ *   ASO SQ to initialize.
+ */
+static void
+mlx5_aso_cnt_init_sq(struct mlx5_aso_sq *sq)
+{
+	volatile struct mlx5_aso_wqe *restrict wqe;
+	int i;
+	int size = 1 << sq->log_desc_n;
+
+	/* All the next fields state should stay constant. */
+	for (i = 0, wqe = &sq->sq_obj.aso_wqes[0]; i < size; ++i, ++wqe) {
+		wqe->general_cseg.sq_ds = rte_cpu_to_be_32((sq->sqn << 8) |
+							  (sizeof(*wqe) >> 4));
+		wqe->aso_cseg.operand_masks = rte_cpu_to_be_32
+			(0u |
+			 (ASO_OPER_LOGICAL_OR << ASO_CSEG_COND_OPER_OFFSET) |
+			 (ASO_OP_ALWAYS_FALSE << ASO_CSEG_COND_1_OPER_OFFSET) |
+			 (ASO_OP_ALWAYS_FALSE << ASO_CSEG_COND_0_OPER_OFFSET) |
+			 (BYTEWISE_64BYTE << ASO_CSEG_DATA_MASK_MODE_OFFSET));
+		wqe->aso_cseg.data_mask = RTE_BE64(UINT64_MAX);
+	}
+}
+
 /**
  * Initialize Send Queue used for ASO access.
  *
@@ -191,7 +221,7 @@  mlx5_aso_ct_init_sq(struct mlx5_aso_sq *sq)
  */
 static int
 mlx5_aso_sq_create(struct mlx5_common_device *cdev, struct mlx5_aso_sq *sq,
-		   void *uar)
+		   void *uar, uint16_t log_desc_n)
 {
 	struct mlx5_devx_cq_attr cq_attr = {
 		.uar_page_id = mlx5_os_get_devx_uar_page_id(uar),
@@ -212,12 +242,12 @@  mlx5_aso_sq_create(struct mlx5_common_device *cdev, struct mlx5_aso_sq *sq,
 	int ret;
 
 	if (mlx5_devx_cq_create(cdev->ctx, &sq->cq.cq_obj,
-				MLX5_ASO_QUEUE_LOG_DESC, &cq_attr,
+				log_desc_n, &cq_attr,
 				SOCKET_ID_ANY))
 		goto error;
 	sq->cq.cq_ci = 0;
-	sq->cq.log_desc_n = MLX5_ASO_QUEUE_LOG_DESC;
-	sq->log_desc_n = MLX5_ASO_QUEUE_LOG_DESC;
+	sq->cq.log_desc_n = log_desc_n;
+	sq->log_desc_n = log_desc_n;
 	sq_attr.cqn = sq->cq.cq_obj.cq->id;
 	/* for mlx5_aso_wqe that is twice the size of mlx5_wqe */
 	log_wqbb_n = sq->log_desc_n + 1;
@@ -269,7 +299,8 @@  mlx5_aso_queue_init(struct mlx5_dev_ctx_shared *sh,
 				    sq_desc_n, &sh->aso_age_mng->aso_sq.mr))
 			return -1;
 		if (mlx5_aso_sq_create(cdev, &sh->aso_age_mng->aso_sq,
-				       sh->tx_uar.obj)) {
+				       sh->tx_uar.obj,
+				       MLX5_ASO_QUEUE_LOG_DESC)) {
 			mlx5_aso_dereg_mr(cdev, &sh->aso_age_mng->aso_sq.mr);
 			return -1;
 		}
@@ -277,7 +308,7 @@  mlx5_aso_queue_init(struct mlx5_dev_ctx_shared *sh,
 		break;
 	case ASO_OPC_MOD_POLICER:
 		if (mlx5_aso_sq_create(cdev, &sh->mtrmng->pools_mng.sq,
-				       sh->tx_uar.obj))
+				       sh->tx_uar.obj, MLX5_ASO_QUEUE_LOG_DESC))
 			return -1;
 		mlx5_aso_mtr_init_sq(&sh->mtrmng->pools_mng.sq);
 		break;
@@ -287,7 +318,7 @@  mlx5_aso_queue_init(struct mlx5_dev_ctx_shared *sh,
 				    &sh->ct_mng->aso_sq.mr))
 			return -1;
 		if (mlx5_aso_sq_create(cdev, &sh->ct_mng->aso_sq,
-				       sh->tx_uar.obj)) {
+				       sh->tx_uar.obj, MLX5_ASO_QUEUE_LOG_DESC)) {
 			mlx5_aso_dereg_mr(cdev, &sh->ct_mng->aso_sq.mr);
 			return -1;
 		}
@@ -1403,3 +1434,219 @@  mlx5_aso_ct_available(struct mlx5_dev_ctx_shared *sh,
 	rte_errno = EBUSY;
 	return -rte_errno;
 }
+
+int
+mlx5_aso_cnt_queue_init(struct mlx5_dev_ctx_shared *sh)
+{
+	struct mlx5_hws_aso_mng *aso_mng = NULL;
+	uint8_t idx;
+	struct mlx5_aso_sq *sq;
+
+	MLX5_ASSERT(sh);
+	MLX5_ASSERT(sh->cnt_svc);
+	aso_mng = &sh->cnt_svc->aso_mng;
+	aso_mng->sq_num = HWS_CNT_ASO_SQ_NUM;
+	for (idx = 0; idx < HWS_CNT_ASO_SQ_NUM; idx++) {
+		sq = &aso_mng->sqs[idx];
+		if (mlx5_aso_sq_create(sh->cdev, sq, sh->tx_uar.obj,
+					MLX5_ASO_CNT_QUEUE_LOG_DESC))
+			goto error;
+		mlx5_aso_cnt_init_sq(sq);
+	}
+	return 0;
+error:
+	mlx5_aso_cnt_queue_uninit(sh);
+	return -1;
+}
+
+void
+mlx5_aso_cnt_queue_uninit(struct mlx5_dev_ctx_shared *sh)
+{
+	uint16_t idx;
+
+	for (idx = 0; idx < sh->cnt_svc->aso_mng.sq_num; idx++)
+		mlx5_aso_destroy_sq(&sh->cnt_svc->aso_mng.sqs[idx]);
+	sh->cnt_svc->aso_mng.sq_num = 0;
+}
+
+static uint16_t
+mlx5_aso_cnt_sq_enqueue_burst(struct mlx5_hws_cnt_pool *cpool,
+		struct mlx5_dev_ctx_shared *sh,
+		struct mlx5_aso_sq *sq, uint32_t n,
+		uint32_t offset, uint32_t dcs_id_base)
+{
+	volatile struct mlx5_aso_wqe *wqe;
+	uint16_t size = 1 << sq->log_desc_n;
+	uint16_t mask = size - 1;
+	uint16_t max;
+	uint32_t upper_offset = offset;
+	uint64_t addr;
+	uint32_t ctrl_gen_id = 0;
+	uint8_t opcmod = sh->cdev->config.hca_attr.flow_access_aso_opc_mod;
+	rte_be32_t lkey = rte_cpu_to_be_32(cpool->raw_mng->mr.lkey);
+	uint16_t aso_n = (uint16_t)(RTE_ALIGN_CEIL(n, 4) / 4);
+	uint32_t ccntid;
+
+	max = RTE_MIN(size - (uint16_t)(sq->head - sq->tail), aso_n);
+	if (unlikely(!max))
+		return 0;
+	upper_offset += (max * 4);
+	/* Because only one burst at one time, we can use the same elt. */
+	sq->elts[0].burst_size = max;
+	ctrl_gen_id = dcs_id_base;
+	ctrl_gen_id /= 4;
+	do {
+		ccntid = upper_offset - max * 4;
+		wqe = &sq->sq_obj.aso_wqes[sq->head & mask];
+		rte_prefetch0(&sq->sq_obj.aso_wqes[(sq->head + 1) & mask]);
+		wqe->general_cseg.misc = rte_cpu_to_be_32(ctrl_gen_id);
+		wqe->general_cseg.flags = RTE_BE32(MLX5_COMP_ONLY_FIRST_ERR <<
+							 MLX5_COMP_MODE_OFFSET);
+		wqe->general_cseg.opcode = rte_cpu_to_be_32
+						(MLX5_OPCODE_ACCESS_ASO |
+						 (opcmod <<
+						  WQE_CSEG_OPC_MOD_OFFSET) |
+						 (sq->pi <<
+						  WQE_CSEG_WQE_INDEX_OFFSET));
+		addr = (uint64_t)RTE_PTR_ADD(cpool->raw_mng->raw,
+				ccntid * sizeof(struct flow_counter_stats));
+		wqe->aso_cseg.va_h = rte_cpu_to_be_32((uint32_t)(addr >> 32));
+		wqe->aso_cseg.va_l_r = rte_cpu_to_be_32((uint32_t)addr | 1u);
+		wqe->aso_cseg.lkey = lkey;
+		sq->pi += 2; /* Each WQE contains 2 WQEBB's. */
+		sq->head++;
+		sq->next++;
+		ctrl_gen_id++;
+		max--;
+	} while (max);
+	wqe->general_cseg.flags = RTE_BE32(MLX5_COMP_ALWAYS <<
+							 MLX5_COMP_MODE_OFFSET);
+	mlx5_doorbell_ring(&sh->tx_uar.bf_db, *(volatile uint64_t *)wqe,
+			   sq->pi, &sq->sq_obj.db_rec[MLX5_SND_DBR],
+			   !sh->tx_uar.dbnc);
+	return sq->elts[0].burst_size;
+}
+
+static uint16_t
+mlx5_aso_cnt_completion_handle(struct mlx5_aso_sq *sq)
+{
+	struct mlx5_aso_cq *cq = &sq->cq;
+	volatile struct mlx5_cqe *restrict cqe;
+	const unsigned int cq_size = 1 << cq->log_desc_n;
+	const unsigned int mask = cq_size - 1;
+	uint32_t idx;
+	uint32_t next_idx = cq->cq_ci & mask;
+	const uint16_t max = (uint16_t)(sq->head - sq->tail);
+	uint16_t i = 0;
+	int ret;
+	if (unlikely(!max))
+		return 0;
+	idx = next_idx;
+	next_idx = (cq->cq_ci + 1) & mask;
+	rte_prefetch0(&cq->cq_obj.cqes[next_idx]);
+	cqe = &cq->cq_obj.cqes[idx];
+	ret = check_cqe(cqe, cq_size, cq->cq_ci);
+	/*
+	 * Be sure owner read is done before any other cookie field or
+	 * opaque field.
+	 */
+	rte_io_rmb();
+	if (unlikely(ret != MLX5_CQE_STATUS_SW_OWN)) {
+		if (likely(ret == MLX5_CQE_STATUS_HW_OWN))
+			return 0; /* return immediately. */
+		mlx5_aso_cqe_err_handle(sq);
+	}
+	i += sq->elts[0].burst_size;
+	sq->elts[0].burst_size = 0;
+	cq->cq_ci++;
+	if (likely(i)) {
+		sq->tail += i;
+		rte_io_wmb();
+		cq->cq_obj.db_rec[0] = rte_cpu_to_be_32(cq->cq_ci);
+	}
+	return i;
+}
+
+static uint16_t
+mlx5_aso_cnt_query_one_dcs(struct mlx5_dev_ctx_shared *sh,
+			   struct mlx5_hws_cnt_pool *cpool,
+			   uint8_t dcs_idx, uint32_t num)
+{
+	uint32_t dcs_id = cpool->dcs_mng.dcs[dcs_idx].obj->id;
+	uint64_t cnt_num = cpool->dcs_mng.dcs[dcs_idx].batch_sz;
+	uint64_t left;
+	uint32_t iidx = cpool->dcs_mng.dcs[dcs_idx].iidx;
+	uint32_t offset;
+	uint16_t mask;
+	uint16_t sq_idx;
+	uint64_t burst_sz = (uint64_t)(1 << MLX5_ASO_CNT_QUEUE_LOG_DESC) * 4 *
+		sh->cnt_svc->aso_mng.sq_num;
+	uint64_t qburst_sz = burst_sz / sh->cnt_svc->aso_mng.sq_num;
+	uint64_t n;
+	struct mlx5_aso_sq *sq;
+
+	cnt_num = RTE_MIN(num, cnt_num);
+	left = cnt_num;
+	while (left) {
+		mask = 0;
+		for (sq_idx = 0; sq_idx < sh->cnt_svc->aso_mng.sq_num;
+				sq_idx++) {
+			if (left == 0) {
+				mask |= (1 << sq_idx);
+				continue;
+			}
+			n = RTE_MIN(left, qburst_sz);
+			offset = cnt_num - left;
+			offset += iidx;
+			mlx5_aso_cnt_sq_enqueue_burst(cpool, sh,
+					&sh->cnt_svc->aso_mng.sqs[sq_idx], n,
+					offset, dcs_id);
+			left -= n;
+		}
+		do {
+			for (sq_idx = 0; sq_idx < sh->cnt_svc->aso_mng.sq_num;
+					sq_idx++) {
+				sq = &sh->cnt_svc->aso_mng.sqs[sq_idx];
+				if (mlx5_aso_cnt_completion_handle(sq))
+					mask |= (1 << sq_idx);
+			}
+		} while (mask < ((1 << sh->cnt_svc->aso_mng.sq_num) - 1));
+	}
+	return cnt_num;
+}
+
+/*
+ * Query FW counter via ASO WQE.
+ *
+ * ASO query counter use _sync_ mode, means:
+ * 1. each SQ issue one burst with several WQEs
+ * 2. ask for CQE at last WQE
+ * 3. busy poll CQ of each SQ's
+ * 4. If all SQ's CQE are received then goto step 1, issue next burst
+ *
+ * @param[in] sh
+ *   Pointer to shared device.
+ * @param[in] cpool
+ *   Pointer to counter pool.
+ *
+ * @return
+ *   0 on success, -1 on failure.
+ */
+int
+mlx5_aso_cnt_query(struct mlx5_dev_ctx_shared *sh,
+		   struct mlx5_hws_cnt_pool *cpool)
+{
+	uint32_t idx;
+	uint32_t num;
+	uint32_t cnt_num = mlx5_hws_cnt_pool_get_size(cpool) -
+		rte_ring_count(cpool->free_list);
+
+	for (idx = 0; idx < cpool->dcs_mng.batch_total; idx++) {
+		num = RTE_MIN(cnt_num, cpool->dcs_mng.dcs[idx].batch_sz);
+		mlx5_aso_cnt_query_one_dcs(sh, cpool, idx, num);
+		cnt_num -= num;
+		if (cnt_num == 0)
+			break;
+	}
+	return 0;
+}
diff --git a/drivers/net/mlx5/mlx5_flow_hw.c b/drivers/net/mlx5/mlx5_flow_hw.c
index c2e16bc56d..507abb54e4 100644
--- a/drivers/net/mlx5/mlx5_flow_hw.c
+++ b/drivers/net/mlx5/mlx5_flow_hw.c
@@ -10,6 +10,7 @@ 
 #include "mlx5_rx.h"
 
 #if defined(HAVE_IBV_FLOW_DV_SUPPORT) || !defined(HAVE_INFINIBAND_VERBS_H)
+#include "mlx5_hws_cnt.h"
 
 /* The maximum actions support in the flow. */
 #define MLX5_HW_MAX_ACTS 16
@@ -353,6 +354,10 @@  __flow_hw_action_template_destroy(struct rte_eth_dev *dev,
 			mlx5dr_action_destroy(acts->mhdr->action);
 		mlx5_free(acts->mhdr);
 	}
+	if (mlx5_hws_cnt_id_valid(acts->cnt_id)) {
+		mlx5_hws_cnt_shared_put(priv->hws_cpool, &acts->cnt_id);
+		acts->cnt_id = 0;
+	}
 }
 
 /**
@@ -532,6 +537,44 @@  __flow_hw_act_data_shared_rss_append(struct mlx5_priv *priv,
 	return 0;
 }
 
+/**
+ * Append shared counter action to the dynamic action list.
+ *
+ * @param[in] priv
+ *   Pointer to the port private data structure.
+ * @param[in] acts
+ *   Pointer to the template HW steering DR actions.
+ * @param[in] type
+ *   Action type.
+ * @param[in] action_src
+ *   Offset of source rte flow action.
+ * @param[in] action_dst
+ *   Offset of destination DR action.
+ * @param[in] cnt_id
+ *   Shared counter id.
+ *
+ * @return
+ *    0 on success, negative value otherwise and rte_errno is set.
+ */
+static __rte_always_inline int
+__flow_hw_act_data_shared_cnt_append(struct mlx5_priv *priv,
+				     struct mlx5_hw_actions *acts,
+				     enum rte_flow_action_type type,
+				     uint16_t action_src,
+				     uint16_t action_dst,
+				     cnt_id_t cnt_id)
+{	struct mlx5_action_construct_data *act_data;
+
+	act_data = __flow_hw_act_data_alloc(priv, type, action_src, action_dst);
+	if (!act_data)
+		return -1;
+	act_data->type = type;
+	act_data->shared_counter.id = cnt_id;
+	LIST_INSERT_HEAD(&acts->act_list, act_data, next);
+	return 0;
+}
+
+
 /**
  * Translate shared indirect action.
  *
@@ -573,6 +616,13 @@  flow_hw_shared_action_translate(struct rte_eth_dev *dev,
 		    action_src, action_dst, idx, shared_rss))
 			return -1;
 		break;
+	case MLX5_INDIRECT_ACTION_TYPE_COUNT:
+		if (__flow_hw_act_data_shared_cnt_append(priv, acts,
+			(enum rte_flow_action_type)
+			MLX5_RTE_FLOW_ACTION_TYPE_COUNT,
+			action_src, action_dst, act_idx))
+			return -1;
+		break;
 	default:
 		DRV_LOG(WARNING, "Unsupported shared action type:%d", type);
 		break;
@@ -946,6 +996,30 @@  flow_hw_meter_compile(struct rte_eth_dev *dev,
 	}
 	return 0;
 }
+
+static __rte_always_inline int
+flow_hw_cnt_compile(struct rte_eth_dev *dev, uint32_t  start_pos,
+		      struct mlx5_hw_actions *acts)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	uint32_t pos = start_pos;
+	cnt_id_t cnt_id;
+	int ret;
+
+	ret = mlx5_hws_cnt_shared_get(priv->hws_cpool, &cnt_id);
+	if (ret != 0)
+		return ret;
+	ret = mlx5_hws_cnt_pool_get_action_offset
+				(priv->hws_cpool,
+				 cnt_id,
+				 &acts->rule_acts[pos].action,
+				 &acts->rule_acts[pos].counter.offset);
+	if (ret != 0)
+		return ret;
+	acts->cnt_id = cnt_id;
+	return 0;
+}
+
 /**
  * Translate rte_flow actions to DR action.
  *
@@ -1189,6 +1263,20 @@  flow_hw_actions_translate(struct rte_eth_dev *dev,
 				goto err;
 			i++;
 			break;
+		case RTE_FLOW_ACTION_TYPE_COUNT:
+			if (masks->conf &&
+			    ((const struct rte_flow_action_count *)
+			     masks->conf)->id) {
+				err = flow_hw_cnt_compile(dev, i, acts);
+				if (err)
+					goto err;
+			} else if (__flow_hw_act_data_general_append
+					(priv, acts, actions->type,
+					 actions - action_start, i)) {
+				goto err;
+			}
+			i++;
+			break;
 		case RTE_FLOW_ACTION_TYPE_END:
 			actions_end = true;
 			break;
@@ -1377,6 +1465,13 @@  flow_hw_shared_action_construct(struct rte_eth_dev *dev,
 				(dev, &act_data, item_flags, rule_act))
 			return -1;
 		break;
+	case MLX5_INDIRECT_ACTION_TYPE_COUNT:
+		if (mlx5_hws_cnt_pool_get_action_offset(priv->hws_cpool,
+				act_idx,
+				&rule_act->action,
+				&rule_act->counter.offset))
+			return -1;
+		break;
 	default:
 		DRV_LOG(WARNING, "Unsupported shared action type:%d", type);
 		break;
@@ -1520,7 +1615,8 @@  flow_hw_actions_construct(struct rte_eth_dev *dev,
 			  const uint8_t it_idx,
 			  const struct rte_flow_action actions[],
 			  struct mlx5dr_rule_action *rule_acts,
-			  uint32_t *acts_num)
+			  uint32_t *acts_num,
+			  uint32_t queue)
 {
 	struct mlx5_priv *priv = dev->data->dev_private;
 	struct rte_flow_template_table *table = job->flow->table;
@@ -1574,6 +1670,7 @@  flow_hw_actions_construct(struct rte_eth_dev *dev,
 		uint64_t item_flags;
 		struct mlx5_hw_jump_action *jump;
 		struct mlx5_hrxq *hrxq;
+		cnt_id_t cnt_id;
 
 		action = &actions[act_data->action_src];
 		MLX5_ASSERT(action->type == RTE_FLOW_ACTION_TYPE_INDIRECT ||
@@ -1681,6 +1778,32 @@  flow_hw_actions_construct(struct rte_eth_dev *dev,
 			if (mlx5_aso_mtr_wait(priv->sh, mtr))
 				return -1;
 			break;
+		case RTE_FLOW_ACTION_TYPE_COUNT:
+			ret = mlx5_hws_cnt_pool_get(priv->hws_cpool, &queue,
+					&cnt_id);
+			if (ret != 0)
+				return ret;
+			ret = mlx5_hws_cnt_pool_get_action_offset
+				(priv->hws_cpool,
+				 cnt_id,
+				 &rule_acts[act_data->action_dst].action,
+				 &rule_acts[act_data->action_dst].counter.offset
+				 );
+			if (ret != 0)
+				return ret;
+			job->flow->cnt_id = cnt_id;
+			break;
+		case MLX5_RTE_FLOW_ACTION_TYPE_COUNT:
+			ret = mlx5_hws_cnt_pool_get_action_offset
+				(priv->hws_cpool,
+				 act_data->shared_counter.id,
+				 &rule_acts[act_data->action_dst].action,
+				 &rule_acts[act_data->action_dst].counter.offset
+				 );
+			if (ret != 0)
+				return ret;
+			job->flow->cnt_id = act_data->shared_counter.id;
+			break;
 		default:
 			break;
 		}
@@ -1690,6 +1813,8 @@  flow_hw_actions_construct(struct rte_eth_dev *dev,
 				job->flow->idx - 1;
 		rule_acts[hw_acts->encap_decap_pos].reformat.data = buf;
 	}
+	if (mlx5_hws_cnt_id_valid(hw_acts->cnt_id))
+		job->flow->cnt_id = hw_acts->cnt_id;
 	return 0;
 }
 
@@ -1825,7 +1950,7 @@  flow_hw_async_flow_create(struct rte_eth_dev *dev,
 	 * user's input, in order to save the cost.
 	 */
 	if (flow_hw_actions_construct(dev, job, hw_acts, pattern_template_index,
-				  actions, rule_acts, &acts_num)) {
+				  actions, rule_acts, &acts_num, queue)) {
 		rte_errno = EINVAL;
 		goto free;
 	}
@@ -1955,6 +2080,13 @@  flow_hw_pull(struct rte_eth_dev *dev,
 				flow_hw_jump_release(dev, job->flow->jump);
 			else if (job->flow->fate_type == MLX5_FLOW_FATE_QUEUE)
 				mlx5_hrxq_obj_release(dev, job->flow->hrxq);
+			if (mlx5_hws_cnt_id_valid(job->flow->cnt_id) &&
+			    mlx5_hws_cnt_is_shared
+				(priv->hws_cpool, job->flow->cnt_id) == false) {
+				mlx5_hws_cnt_pool_put(priv->hws_cpool, &queue,
+						&job->flow->cnt_id);
+				job->flow->cnt_id = 0;
+			}
 			mlx5_ipool_free(job->flow->table->flow, job->flow->idx);
 		}
 		priv->hw_q[queue].job[priv->hw_q[queue].job_idx++] = job;
@@ -2678,6 +2810,9 @@  flow_hw_actions_validate(struct rte_eth_dev *dev,
 			if (ret < 0)
 				return ret;
 			break;
+		case RTE_FLOW_ACTION_TYPE_COUNT:
+			/* TODO: Validation logic */
+			break;
 		case RTE_FLOW_ACTION_TYPE_END:
 			actions_end = true;
 			break;
@@ -4355,6 +4490,12 @@  flow_hw_configure(struct rte_eth_dev *dev,
 	}
 	if (_queue_attr)
 		mlx5_free(_queue_attr);
+	if (port_attr->nb_counters) {
+		priv->hws_cpool = mlx5_hws_cnt_pool_create(dev, port_attr,
+				nb_queue);
+		if (priv->hws_cpool == NULL)
+			goto err;
+	}
 	return 0;
 err:
 	flow_hw_free_vport_actions(priv);
@@ -4424,6 +4565,8 @@  flow_hw_resource_release(struct rte_eth_dev *dev)
 		mlx5_ipool_destroy(priv->acts_ipool);
 		priv->acts_ipool = NULL;
 	}
+	if (priv->hws_cpool)
+		mlx5_hws_cnt_pool_destroy(priv->sh, priv->hws_cpool);
 	mlx5_free(priv->hw_q);
 	priv->hw_q = NULL;
 	claim_zero(mlx5dr_context_close(priv->dr_ctx));
@@ -4565,10 +4708,28 @@  flow_hw_action_handle_create(struct rte_eth_dev *dev, uint32_t queue,
 			     void *user_data,
 			     struct rte_flow_error *error)
 {
+	struct rte_flow_action_handle *handle = NULL;
+	struct mlx5_priv *priv = dev->data->dev_private;
+	cnt_id_t cnt_id;
+
 	RTE_SET_USED(queue);
 	RTE_SET_USED(attr);
 	RTE_SET_USED(user_data);
-	return flow_dv_action_create(dev, conf, action, error);
+	switch (action->type) {
+	case RTE_FLOW_ACTION_TYPE_COUNT:
+		if (mlx5_hws_cnt_shared_get(priv->hws_cpool, &cnt_id))
+			rte_flow_error_set(error, ENODEV,
+					RTE_FLOW_ERROR_TYPE_ACTION,
+					NULL,
+					"counter are not configured!");
+		else
+			handle = (struct rte_flow_action_handle *)
+				 (uintptr_t)cnt_id;
+		break;
+	default:
+		handle = flow_dv_action_create(dev, conf, action, error);
+	}
+	return handle;
 }
 
 /**
@@ -4632,10 +4793,172 @@  flow_hw_action_handle_destroy(struct rte_eth_dev *dev, uint32_t queue,
 			      void *user_data,
 			      struct rte_flow_error *error)
 {
+	uint32_t act_idx = (uint32_t)(uintptr_t)handle;
+	uint32_t type = act_idx >> MLX5_INDIRECT_ACTION_TYPE_OFFSET;
+	struct mlx5_priv *priv = dev->data->dev_private;
+
 	RTE_SET_USED(queue);
 	RTE_SET_USED(attr);
 	RTE_SET_USED(user_data);
-	return flow_dv_action_destroy(dev, handle, error);
+	switch (type) {
+	case MLX5_INDIRECT_ACTION_TYPE_COUNT:
+		return mlx5_hws_cnt_shared_put(priv->hws_cpool, &act_idx);
+	default:
+		return flow_dv_action_destroy(dev, handle, error);
+	}
+}
+
+static int
+flow_hw_query_counter(const struct rte_eth_dev *dev, uint32_t counter,
+		      void *data, struct rte_flow_error *error)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_hws_cnt *cnt;
+	struct rte_flow_query_count *qc = data;
+	uint32_t iidx = mlx5_hws_cnt_iidx(priv->hws_cpool, counter);
+	uint64_t pkts, bytes;
+
+	if (!mlx5_hws_cnt_id_valid(counter))
+		return rte_flow_error_set(error, EINVAL,
+				RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+				"counter are not available");
+	cnt = &priv->hws_cpool->pool[iidx];
+	__hws_cnt_query_raw(priv->hws_cpool, counter, &pkts, &bytes);
+	qc->hits_set = 1;
+	qc->bytes_set = 1;
+	qc->hits = pkts - cnt->reset.hits;
+	qc->bytes = bytes - cnt->reset.bytes;
+	if (qc->reset) {
+		cnt->reset.bytes = bytes;
+		cnt->reset.hits = pkts;
+	}
+	return 0;
+}
+
+static int
+flow_hw_query(struct rte_eth_dev *dev,
+	      struct rte_flow *flow __rte_unused,
+	      const struct rte_flow_action *actions __rte_unused,
+	      void *data __rte_unused,
+	      struct rte_flow_error *error __rte_unused)
+{
+	int ret = -EINVAL;
+	struct rte_flow_hw *hw_flow = (struct rte_flow_hw *)flow;
+
+	for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
+		switch (actions->type) {
+		case RTE_FLOW_ACTION_TYPE_VOID:
+			break;
+		case RTE_FLOW_ACTION_TYPE_COUNT:
+			ret = flow_hw_query_counter(dev, hw_flow->cnt_id, data,
+						  error);
+			break;
+		default:
+			return rte_flow_error_set(error, ENOTSUP,
+						  RTE_FLOW_ERROR_TYPE_ACTION,
+						  actions,
+						  "action not supported");
+		}
+	}
+	return ret;
+}
+
+/**
+ * Create indirect action.
+ *
+ * @param[in] dev
+ *   Pointer to the Ethernet device structure.
+ * @param[in] conf
+ *   Shared action configuration.
+ * @param[in] action
+ *   Action specification used to create indirect action.
+ * @param[out] error
+ *   Perform verbose error reporting if not NULL. Initialized in case of
+ *   error only.
+ *
+ * @return
+ *   A valid shared action handle in case of success, NULL otherwise and
+ *   rte_errno is set.
+ */
+static struct rte_flow_action_handle *
+flow_hw_action_create(struct rte_eth_dev *dev,
+		       const struct rte_flow_indir_action_conf *conf,
+		       const struct rte_flow_action *action,
+		       struct rte_flow_error *err)
+{
+	return flow_hw_action_handle_create(dev, UINT32_MAX, NULL, conf, action,
+					    NULL, err);
+}
+
+/**
+ * Destroy the indirect action.
+ * Release action related resources on the NIC and the memory.
+ * Lock free, (mutex should be acquired by caller).
+ * Dispatcher for action type specific call.
+ *
+ * @param[in] dev
+ *   Pointer to the Ethernet device structure.
+ * @param[in] handle
+ *   The indirect action object handle to be removed.
+ * @param[out] error
+ *   Perform verbose error reporting if not NULL. Initialized in case of
+ *   error only.
+ *
+ * @return
+ *   0 on success, otherwise negative errno value.
+ */
+static int
+flow_hw_action_destroy(struct rte_eth_dev *dev,
+		       struct rte_flow_action_handle *handle,
+		       struct rte_flow_error *error)
+{
+	return flow_hw_action_handle_destroy(dev, UINT32_MAX, NULL, handle,
+			NULL, error);
+}
+
+/**
+ * Updates in place shared action configuration.
+ *
+ * @param[in] dev
+ *   Pointer to the Ethernet device structure.
+ * @param[in] handle
+ *   The indirect action object handle to be updated.
+ * @param[in] update
+ *   Action specification used to modify the action pointed by *handle*.
+ *   *update* could be of same type with the action pointed by the *handle*
+ *   handle argument, or some other structures like a wrapper, depending on
+ *   the indirect action type.
+ * @param[out] error
+ *   Perform verbose error reporting if not NULL. Initialized in case of
+ *   error only.
+ *
+ * @return
+ *   0 on success, otherwise negative errno value.
+ */
+static int
+flow_hw_action_update(struct rte_eth_dev *dev,
+		      struct rte_flow_action_handle *handle,
+		      const void *update,
+		      struct rte_flow_error *err)
+{
+	return flow_hw_action_handle_update(dev, UINT32_MAX, NULL, handle,
+			update, NULL, err);
+}
+
+static int
+flow_hw_action_query(struct rte_eth_dev *dev,
+		     const struct rte_flow_action_handle *handle, void *data,
+		     struct rte_flow_error *error)
+{
+	uint32_t act_idx = (uint32_t)(uintptr_t)handle;
+	uint32_t type = act_idx >> MLX5_INDIRECT_ACTION_TYPE_OFFSET;
+
+	switch (type) {
+	case MLX5_INDIRECT_ACTION_TYPE_COUNT:
+		return flow_hw_query_counter(dev, act_idx, data, error);
+	default:
+		return flow_dv_action_query(dev, handle, data, error);
+	}
 }
 
 const struct mlx5_flow_driver_ops mlx5_flow_hw_drv_ops = {
@@ -4657,10 +4980,11 @@  const struct mlx5_flow_driver_ops mlx5_flow_hw_drv_ops = {
 	.async_action_destroy = flow_hw_action_handle_destroy,
 	.async_action_update = flow_hw_action_handle_update,
 	.action_validate = flow_dv_action_validate,
-	.action_create = flow_dv_action_create,
-	.action_destroy = flow_dv_action_destroy,
-	.action_update = flow_dv_action_update,
-	.action_query = flow_dv_action_query,
+	.action_create = flow_hw_action_create,
+	.action_destroy = flow_hw_action_destroy,
+	.action_update = flow_hw_action_update,
+	.action_query = flow_hw_action_query,
+	.query = flow_hw_query,
 };
 
 /**
diff --git a/drivers/net/mlx5/mlx5_hws_cnt.c b/drivers/net/mlx5/mlx5_hws_cnt.c
new file mode 100644
index 0000000000..e2408ef36d
--- /dev/null
+++ b/drivers/net/mlx5/mlx5_hws_cnt.c
@@ -0,0 +1,528 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2020 Mellanox Technologies, Ltd
+ */
+
+#include <stdint.h>
+#include <rte_malloc.h>
+#include <mlx5_malloc.h>
+#include <rte_ring.h>
+#include <mlx5_devx_cmds.h>
+#include <rte_cycles.h>
+
+#if defined(HAVE_IBV_FLOW_DV_SUPPORT) || !defined(HAVE_INFINIBAND_VERBS_H)
+
+#include "mlx5_utils.h"
+#include "mlx5_hws_cnt.h"
+
+#define HWS_CNT_CACHE_SZ_DEFAULT 511
+#define HWS_CNT_CACHE_PRELOAD_DEFAULT 254
+#define HWS_CNT_CACHE_FETCH_DEFAULT 254
+#define HWS_CNT_CACHE_THRESHOLD_DEFAULT 254
+#define HWS_CNT_ALLOC_FACTOR_DEFAULT 20
+
+static void
+__hws_cnt_id_load(struct mlx5_hws_cnt_pool *cpool)
+{
+	uint32_t preload;
+	uint32_t q_num = cpool->cache->q_num;
+	uint32_t cnt_num = mlx5_hws_cnt_pool_get_size(cpool);
+	cnt_id_t cnt_id, iidx = 0;
+	uint32_t qidx;
+	struct rte_ring *qcache = NULL;
+
+	/*
+	 * Counter ID order is important for tracking the max number of in used
+	 * counter for querying, which means counter internal index order must
+	 * be from zero to the number user configured, i.e: 0 - 8000000.
+	 * Need to load counter ID in this order into the cache firstly,
+	 * and then the global free list.
+	 * In the end, user fetch the the counter from minimal to the maximum.
+	 */
+	preload = RTE_MIN(cpool->cache->preload_sz, cnt_num / q_num);
+	for (qidx = 0; qidx < q_num; qidx++) {
+		for (; iidx < preload * (qidx + 1); iidx++) {
+			cnt_id = mlx5_hws_cnt_id_gen(cpool, iidx);
+			qcache = cpool->cache->qcache[qidx];
+			if (qcache)
+				rte_ring_enqueue_elem(qcache, &cnt_id,
+						sizeof(cnt_id));
+		}
+	}
+	for (; iidx < cnt_num; iidx++) {
+		cnt_id = mlx5_hws_cnt_id_gen(cpool, iidx);
+		rte_ring_enqueue_elem(cpool->free_list, &cnt_id,
+				sizeof(cnt_id));
+	}
+}
+
+static void
+__mlx5_hws_cnt_svc(struct mlx5_dev_ctx_shared *sh,
+		struct mlx5_hws_cnt_pool *cpool)
+{
+	struct rte_ring *reset_list = cpool->wait_reset_list;
+	struct rte_ring *reuse_list = cpool->reuse_list;
+	uint32_t reset_cnt_num;
+	struct rte_ring_zc_data zcdr = {0};
+	struct rte_ring_zc_data zcdu = {0};
+
+	reset_cnt_num = rte_ring_count(reset_list);
+	do {
+		cpool->query_gen++;
+		mlx5_aso_cnt_query(sh, cpool);
+		zcdr.n1 = 0;
+		zcdu.n1 = 0;
+		rte_ring_enqueue_zc_burst_elem_start(reuse_list,
+				sizeof(cnt_id_t), reset_cnt_num, &zcdu,
+				NULL);
+		rte_ring_dequeue_zc_burst_elem_start(reset_list,
+				sizeof(cnt_id_t), reset_cnt_num, &zcdr,
+				NULL);
+		__hws_cnt_r2rcpy(&zcdu, &zcdr, reset_cnt_num);
+		rte_ring_dequeue_zc_elem_finish(reset_list,
+				reset_cnt_num);
+		rte_ring_enqueue_zc_elem_finish(reuse_list,
+				reset_cnt_num);
+		reset_cnt_num = rte_ring_count(reset_list);
+	} while (reset_cnt_num > 0);
+}
+
+static void
+mlx5_hws_cnt_raw_data_free(struct mlx5_dev_ctx_shared *sh,
+			   struct mlx5_hws_cnt_raw_data_mng *mng)
+{
+	if (mng == NULL)
+		return;
+	sh->cdev->mr_scache.dereg_mr_cb(&mng->mr);
+	mlx5_free(mng->raw);
+	mlx5_free(mng);
+}
+
+__rte_unused
+static struct mlx5_hws_cnt_raw_data_mng *
+mlx5_hws_cnt_raw_data_alloc(struct mlx5_dev_ctx_shared *sh, uint32_t n)
+{
+	struct mlx5_hws_cnt_raw_data_mng *mng = NULL;
+	int ret;
+	size_t sz = n * sizeof(struct flow_counter_stats);
+
+	mng = mlx5_malloc(MLX5_MEM_ANY | MLX5_MEM_ZERO, sizeof(*mng), 0,
+			SOCKET_ID_ANY);
+	if (mng == NULL)
+		goto error;
+	mng->raw = mlx5_malloc(MLX5_MEM_ANY | MLX5_MEM_ZERO, sz, 0,
+			SOCKET_ID_ANY);
+	if (mng->raw == NULL)
+		goto error;
+	ret = sh->cdev->mr_scache.reg_mr_cb(sh->cdev->pd, mng->raw, sz,
+					    &mng->mr);
+	if (ret) {
+		rte_errno = errno;
+		goto error;
+	}
+	return mng;
+error:
+	mlx5_hws_cnt_raw_data_free(sh, mng);
+	return NULL;
+}
+
+static void *
+mlx5_hws_cnt_svc(void *opaque)
+{
+	struct mlx5_dev_ctx_shared *sh =
+		(struct mlx5_dev_ctx_shared *)opaque;
+	uint64_t interval =
+		(uint64_t)sh->cnt_svc->query_interval * (US_PER_S / MS_PER_S);
+	uint16_t port_id;
+	uint64_t start_cycle, query_cycle = 0;
+	uint64_t query_us;
+	uint64_t sleep_us;
+
+	while (sh->cnt_svc->svc_running != 0) {
+		start_cycle = rte_rdtsc();
+		MLX5_ETH_FOREACH_DEV(port_id, sh->cdev->dev) {
+			struct mlx5_priv *opriv =
+				rte_eth_devices[port_id].data->dev_private;
+			if (opriv != NULL &&
+			    opriv->sh == sh &&
+			    opriv->hws_cpool != NULL) {
+				__mlx5_hws_cnt_svc(sh, opriv->hws_cpool);
+			}
+		}
+		query_cycle = rte_rdtsc() - start_cycle;
+		query_us = query_cycle / (rte_get_timer_hz() / US_PER_S);
+		sleep_us = interval - query_us;
+		if (interval > query_us)
+			rte_delay_us_sleep(sleep_us);
+	}
+	return NULL;
+}
+
+struct mlx5_hws_cnt_pool *
+mlx5_hws_cnt_pool_init(const struct mlx5_hws_cnt_pool_cfg *pcfg,
+		const struct mlx5_hws_cache_param *ccfg)
+{
+	char mz_name[RTE_MEMZONE_NAMESIZE];
+	struct mlx5_hws_cnt_pool *cntp;
+	uint64_t cnt_num = 0;
+	uint32_t qidx;
+
+	MLX5_ASSERT(pcfg);
+	MLX5_ASSERT(ccfg);
+	cntp = mlx5_malloc(MLX5_MEM_ANY | MLX5_MEM_ZERO, sizeof(*cntp), 0,
+			   SOCKET_ID_ANY);
+	if (cntp == NULL)
+		return NULL;
+
+	cntp->cfg = *pcfg;
+	cntp->cache = mlx5_malloc(MLX5_MEM_ANY | MLX5_MEM_ZERO,
+			sizeof(*cntp->cache) +
+			sizeof(((struct mlx5_hws_cnt_pool_caches *)0)->qcache[0])
+				* ccfg->q_num, 0, SOCKET_ID_ANY);
+	if (cntp->cache == NULL)
+		goto error;
+	 /* store the necessary cache parameters. */
+	cntp->cache->fetch_sz = ccfg->fetch_sz;
+	cntp->cache->preload_sz = ccfg->preload_sz;
+	cntp->cache->threshold = ccfg->threshold;
+	cntp->cache->q_num = ccfg->q_num;
+	cnt_num = pcfg->request_num * (100 + pcfg->alloc_factor) / 100;
+	if (cnt_num > UINT32_MAX) {
+		DRV_LOG(ERR, "counter number %"PRIu64" is out of 32bit range",
+			cnt_num);
+		goto error;
+	}
+	cntp->pool = mlx5_malloc(MLX5_MEM_ANY | MLX5_MEM_ZERO,
+			sizeof(struct mlx5_hws_cnt) *
+			pcfg->request_num * (100 + pcfg->alloc_factor) / 100,
+			0, SOCKET_ID_ANY);
+	if (cntp->pool == NULL)
+		goto error;
+	snprintf(mz_name, sizeof(mz_name), "%s_F_RING", pcfg->name);
+	cntp->free_list = rte_ring_create_elem(mz_name, sizeof(cnt_id_t),
+			(uint32_t)cnt_num, SOCKET_ID_ANY,
+			RING_F_SP_ENQ | RING_F_MC_HTS_DEQ | RING_F_EXACT_SZ);
+	if (cntp->free_list == NULL) {
+		DRV_LOG(ERR, "failed to create free list ring");
+		goto error;
+	}
+	snprintf(mz_name, sizeof(mz_name), "%s_R_RING", pcfg->name);
+	cntp->wait_reset_list = rte_ring_create_elem(mz_name, sizeof(cnt_id_t),
+			(uint32_t)cnt_num, SOCKET_ID_ANY,
+			RING_F_MP_HTS_ENQ | RING_F_SC_DEQ | RING_F_EXACT_SZ);
+	if (cntp->wait_reset_list == NULL) {
+		DRV_LOG(ERR, "failed to create free list ring");
+		goto error;
+	}
+	snprintf(mz_name, sizeof(mz_name), "%s_U_RING", pcfg->name);
+	cntp->reuse_list = rte_ring_create_elem(mz_name, sizeof(cnt_id_t),
+			(uint32_t)cnt_num, SOCKET_ID_ANY,
+			RING_F_SP_ENQ | RING_F_MC_HTS_DEQ | RING_F_EXACT_SZ);
+	if (cntp->reuse_list == NULL) {
+		DRV_LOG(ERR, "failed to create reuse list ring");
+		goto error;
+	}
+	for (qidx = 0; qidx < ccfg->q_num; qidx++) {
+		snprintf(mz_name, sizeof(mz_name), "%s_cache/%u", pcfg->name,
+				qidx);
+		cntp->cache->qcache[qidx] = rte_ring_create(mz_name, ccfg->size,
+				SOCKET_ID_ANY,
+				RING_F_SP_ENQ | RING_F_SC_DEQ |
+				RING_F_EXACT_SZ);
+		if (cntp->cache->qcache[qidx] == NULL)
+			goto error;
+	}
+	return cntp;
+error:
+	mlx5_hws_cnt_pool_deinit(cntp);
+	return NULL;
+}
+
+void
+mlx5_hws_cnt_pool_deinit(struct mlx5_hws_cnt_pool * const cntp)
+{
+	uint32_t qidx = 0;
+	if (cntp == NULL)
+		return;
+	rte_ring_free(cntp->free_list);
+	rte_ring_free(cntp->wait_reset_list);
+	rte_ring_free(cntp->reuse_list);
+	if (cntp->cache) {
+		for (qidx = 0; qidx < cntp->cache->q_num; qidx++)
+			rte_ring_free(cntp->cache->qcache[qidx]);
+	}
+	mlx5_free(cntp->cache);
+	mlx5_free(cntp->raw_mng);
+	mlx5_free(cntp->pool);
+	mlx5_free(cntp);
+}
+
+int
+mlx5_hws_cnt_service_thread_create(struct mlx5_dev_ctx_shared *sh)
+{
+#define CNT_THREAD_NAME_MAX 256
+	char name[CNT_THREAD_NAME_MAX];
+	rte_cpuset_t cpuset;
+	int ret;
+	uint32_t service_core = sh->cnt_svc->service_core;
+
+	CPU_ZERO(&cpuset);
+	sh->cnt_svc->svc_running = 1;
+	ret = pthread_create(&sh->cnt_svc->service_thread, NULL,
+			mlx5_hws_cnt_svc, sh);
+	if (ret != 0) {
+		DRV_LOG(ERR, "Failed to create HW steering's counter service thread.");
+		return -ENOSYS;
+	}
+	snprintf(name, CNT_THREAD_NAME_MAX - 1, "%s/svc@%d",
+		 sh->ibdev_name, service_core);
+	rte_thread_setname(sh->cnt_svc->service_thread, name);
+	CPU_SET(service_core, &cpuset);
+	pthread_setaffinity_np(sh->cnt_svc->service_thread, sizeof(cpuset),
+				&cpuset);
+	return 0;
+}
+
+void
+mlx5_hws_cnt_service_thread_destroy(struct mlx5_dev_ctx_shared *sh)
+{
+	if (sh->cnt_svc->service_thread == 0)
+		return;
+	sh->cnt_svc->svc_running = 0;
+	pthread_join(sh->cnt_svc->service_thread, NULL);
+	sh->cnt_svc->service_thread = 0;
+}
+
+int
+mlx5_hws_cnt_pool_dcs_alloc(struct mlx5_dev_ctx_shared *sh,
+			    struct mlx5_hws_cnt_pool *cpool)
+{
+	struct mlx5_hca_attr *hca_attr = &sh->cdev->config.hca_attr;
+	uint32_t max_log_bulk_sz = 0;
+	uint32_t log_bulk_sz;
+	uint32_t idx, alloced = 0;
+	unsigned int cnt_num = mlx5_hws_cnt_pool_get_size(cpool);
+	struct mlx5_devx_counter_attr attr = {0};
+	struct mlx5_devx_obj *dcs;
+
+	if (hca_attr->flow_counter_bulk_log_max_alloc == 0) {
+		DRV_LOG(ERR,
+			"Fw doesn't support bulk log max alloc");
+		return -1;
+	}
+	max_log_bulk_sz = 23; /* hard code to 8M (1 << 23). */
+	cnt_num = RTE_ALIGN_CEIL(cnt_num, 4); /* minimal 4 counter in bulk. */
+	log_bulk_sz = RTE_MIN(max_log_bulk_sz, rte_log2_u32(cnt_num));
+	attr.pd = sh->cdev->pdn;
+	attr.pd_valid = 1;
+	attr.bulk_log_max_alloc = 1;
+	attr.flow_counter_bulk_log_size = log_bulk_sz;
+	idx = 0;
+	dcs = mlx5_devx_cmd_flow_counter_alloc_general(sh->cdev->ctx, &attr);
+	if (dcs == NULL)
+		goto error;
+	cpool->dcs_mng.dcs[idx].obj = dcs;
+	cpool->dcs_mng.dcs[idx].batch_sz = (1 << log_bulk_sz);
+	cpool->dcs_mng.batch_total++;
+	idx++;
+	cpool->dcs_mng.dcs[0].iidx = 0;
+	alloced = cpool->dcs_mng.dcs[0].batch_sz;
+	if (cnt_num > cpool->dcs_mng.dcs[0].batch_sz) {
+		for (; idx < MLX5_HWS_CNT_DCS_NUM; idx++) {
+			attr.flow_counter_bulk_log_size = --max_log_bulk_sz;
+			dcs = mlx5_devx_cmd_flow_counter_alloc_general
+				(sh->cdev->ctx, &attr);
+			if (dcs == NULL)
+				goto error;
+			cpool->dcs_mng.dcs[idx].obj = dcs;
+			cpool->dcs_mng.dcs[idx].batch_sz =
+				(1 << max_log_bulk_sz);
+			cpool->dcs_mng.dcs[idx].iidx = alloced;
+			alloced += cpool->dcs_mng.dcs[idx].batch_sz;
+			cpool->dcs_mng.batch_total++;
+		}
+	}
+	return 0;
+error:
+	DRV_LOG(DEBUG,
+		"Cannot alloc device counter, allocated[%" PRIu32 "] request[%" PRIu32 "]",
+		alloced, cnt_num);
+	for (idx = 0; idx < cpool->dcs_mng.batch_total; idx++) {
+		mlx5_devx_cmd_destroy(cpool->dcs_mng.dcs[idx].obj);
+		cpool->dcs_mng.dcs[idx].obj = NULL;
+		cpool->dcs_mng.dcs[idx].batch_sz = 0;
+		cpool->dcs_mng.dcs[idx].iidx = 0;
+	}
+	cpool->dcs_mng.batch_total = 0;
+	return -1;
+}
+
+void
+mlx5_hws_cnt_pool_dcs_free(struct mlx5_dev_ctx_shared *sh,
+			   struct mlx5_hws_cnt_pool *cpool)
+{
+	uint32_t idx;
+
+	if (cpool == NULL)
+		return;
+	for (idx = 0; idx < MLX5_HWS_CNT_DCS_NUM; idx++)
+		mlx5_devx_cmd_destroy(cpool->dcs_mng.dcs[idx].obj);
+	if (cpool->raw_mng) {
+		mlx5_hws_cnt_raw_data_free(sh, cpool->raw_mng);
+		cpool->raw_mng = NULL;
+	}
+}
+
+int
+mlx5_hws_cnt_pool_action_create(struct mlx5_priv *priv,
+		struct mlx5_hws_cnt_pool *cpool)
+{
+	uint32_t idx;
+	int ret = 0;
+	struct mlx5_hws_cnt_dcs *dcs;
+	uint32_t flags;
+
+	flags = MLX5DR_ACTION_FLAG_HWS_RX | MLX5DR_ACTION_FLAG_HWS_TX;
+	if (priv->sh->config.dv_esw_en && priv->master)
+		flags |= MLX5DR_ACTION_FLAG_HWS_FDB;
+	for (idx = 0; idx < cpool->dcs_mng.batch_total; idx++) {
+		dcs = &cpool->dcs_mng.dcs[idx];
+		dcs->dr_action = mlx5dr_action_create_counter(priv->dr_ctx,
+					(struct mlx5dr_devx_obj *)dcs->obj,
+					flags);
+		if (dcs->dr_action == NULL) {
+			mlx5_hws_cnt_pool_action_destroy(cpool);
+			ret = -ENOSYS;
+			break;
+		}
+	}
+	return ret;
+}
+
+void
+mlx5_hws_cnt_pool_action_destroy(struct mlx5_hws_cnt_pool *cpool)
+{
+	uint32_t idx;
+	struct mlx5_hws_cnt_dcs *dcs;
+
+	for (idx = 0; idx < cpool->dcs_mng.batch_total; idx++) {
+		dcs = &cpool->dcs_mng.dcs[idx];
+		if (dcs->dr_action != NULL) {
+			mlx5dr_action_destroy(dcs->dr_action);
+			dcs->dr_action = NULL;
+		}
+	}
+}
+
+struct mlx5_hws_cnt_pool *
+mlx5_hws_cnt_pool_create(struct rte_eth_dev *dev,
+		const struct rte_flow_port_attr *pattr, uint16_t nb_queue)
+{
+	struct mlx5_hws_cnt_pool *cpool = NULL;
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_hws_cache_param cparam = {0};
+	struct mlx5_hws_cnt_pool_cfg pcfg = {0};
+	char *mp_name;
+	int ret = 0;
+	size_t sz;
+
+	/* init cnt service if not. */
+	if (priv->sh->cnt_svc == NULL) {
+		ret = mlx5_hws_cnt_svc_init(priv->sh);
+		if (ret != 0)
+			return NULL;
+	}
+	cparam.fetch_sz = HWS_CNT_CACHE_FETCH_DEFAULT;
+	cparam.preload_sz = HWS_CNT_CACHE_PRELOAD_DEFAULT;
+	cparam.q_num = nb_queue;
+	cparam.threshold = HWS_CNT_CACHE_THRESHOLD_DEFAULT;
+	cparam.size = HWS_CNT_CACHE_SZ_DEFAULT;
+	pcfg.alloc_factor = HWS_CNT_ALLOC_FACTOR_DEFAULT;
+	mp_name = mlx5_malloc(MLX5_MEM_ZERO, RTE_MEMZONE_NAMESIZE, 0,
+			SOCKET_ID_ANY);
+	if (mp_name == NULL)
+		goto error;
+	snprintf(mp_name, RTE_MEMZONE_NAMESIZE, "MLX5_HWS_CNT_POOL_%u",
+			dev->data->port_id);
+	pcfg.name = mp_name;
+	pcfg.request_num = pattr->nb_counters;
+	cpool = mlx5_hws_cnt_pool_init(&pcfg, &cparam);
+	if (cpool == NULL)
+		goto error;
+	ret = mlx5_hws_cnt_pool_dcs_alloc(priv->sh, cpool);
+	if (ret != 0)
+		goto error;
+	sz = RTE_ALIGN_CEIL(mlx5_hws_cnt_pool_get_size(cpool), 4);
+	cpool->raw_mng = mlx5_hws_cnt_raw_data_alloc(priv->sh, sz);
+	if (cpool->raw_mng == NULL)
+		goto error;
+	__hws_cnt_id_load(cpool);
+	/*
+	 * Bump query gen right after pool create so the
+	 * pre-loaded counters can be used directly
+	 * because they already have init value no need
+	 * to wait for query.
+	 */
+	cpool->query_gen = 1;
+	ret = mlx5_hws_cnt_pool_action_create(priv, cpool);
+	if (ret != 0)
+		goto error;
+	priv->sh->cnt_svc->refcnt++;
+	return cpool;
+error:
+	mlx5_hws_cnt_pool_destroy(priv->sh, cpool);
+	return NULL;
+}
+
+void
+mlx5_hws_cnt_pool_destroy(struct mlx5_dev_ctx_shared *sh,
+		struct mlx5_hws_cnt_pool *cpool)
+{
+	if (cpool == NULL)
+		return;
+	if (--sh->cnt_svc->refcnt == 0)
+		mlx5_hws_cnt_svc_deinit(sh);
+	mlx5_hws_cnt_pool_action_destroy(cpool);
+	mlx5_hws_cnt_pool_dcs_free(sh, cpool);
+	mlx5_hws_cnt_raw_data_free(sh, cpool->raw_mng);
+	mlx5_free((void *)cpool->cfg.name);
+	mlx5_hws_cnt_pool_deinit(cpool);
+}
+
+int
+mlx5_hws_cnt_svc_init(struct mlx5_dev_ctx_shared *sh)
+{
+	int ret;
+
+	sh->cnt_svc = mlx5_malloc(MLX5_MEM_ANY | MLX5_MEM_ZERO,
+			sizeof(*sh->cnt_svc), 0, SOCKET_ID_ANY);
+	if (sh->cnt_svc == NULL)
+		return -1;
+	sh->cnt_svc->query_interval = sh->config.cnt_svc.cycle_time;
+	sh->cnt_svc->service_core = sh->config.cnt_svc.service_core;
+	ret = mlx5_aso_cnt_queue_init(sh);
+	if (ret != 0) {
+		mlx5_free(sh->cnt_svc);
+		sh->cnt_svc = NULL;
+		return -1;
+	}
+	ret = mlx5_hws_cnt_service_thread_create(sh);
+	if (ret != 0) {
+		mlx5_aso_cnt_queue_uninit(sh);
+		mlx5_free(sh->cnt_svc);
+		sh->cnt_svc = NULL;
+	}
+	return 0;
+}
+
+void
+mlx5_hws_cnt_svc_deinit(struct mlx5_dev_ctx_shared *sh)
+{
+	if (sh->cnt_svc == NULL)
+		return;
+	mlx5_hws_cnt_service_thread_destroy(sh);
+	mlx5_aso_cnt_queue_uninit(sh);
+	mlx5_free(sh->cnt_svc);
+	sh->cnt_svc = NULL;
+}
+
+#endif
diff --git a/drivers/net/mlx5/mlx5_hws_cnt.h b/drivers/net/mlx5/mlx5_hws_cnt.h
new file mode 100644
index 0000000000..5fab4ba597
--- /dev/null
+++ b/drivers/net/mlx5/mlx5_hws_cnt.h
@@ -0,0 +1,558 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2022 Mellanox Technologies, Ltd
+ */
+
+#ifndef _MLX5_HWS_CNT_H_
+#define _MLX5_HWS_CNT_H_
+
+#include <rte_ring.h>
+#include "mlx5_utils.h"
+#include "mlx5_flow.h"
+
+/*
+ * COUNTER ID's layout
+ *       3                   2                   1                   0
+ *     1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0
+ *    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *    | T |       | D |                                               |
+ *    ~ Y |       | C |                    IDX                        ~
+ *    | P |       | S |                                               |
+ *    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *
+ *    Bit 31:30 = TYPE = MLX5_INDIRECT_ACTION_TYPE_COUNT = b'10
+ *    Bit 25:24 = DCS index
+ *    Bit 23:00 = IDX in this counter belonged DCS bulk.
+ */
+typedef uint32_t cnt_id_t;
+
+#define MLX5_HWS_CNT_DCS_NUM 4
+#define MLX5_HWS_CNT_DCS_IDX_OFFSET 24
+#define MLX5_HWS_CNT_DCS_IDX_MASK 0x3
+#define MLX5_HWS_CNT_IDX_MASK ((1UL << MLX5_HWS_CNT_DCS_IDX_OFFSET) - 1)
+
+struct mlx5_hws_cnt_dcs {
+	void *dr_action;
+	uint32_t batch_sz;
+	uint32_t iidx; /* internal index of first counter in this bulk. */
+	struct mlx5_devx_obj *obj;
+};
+
+struct mlx5_hws_cnt_dcs_mng {
+	uint32_t batch_total;
+	struct mlx5_hws_cnt_dcs dcs[MLX5_HWS_CNT_DCS_NUM];
+};
+
+struct mlx5_hws_cnt {
+	struct flow_counter_stats reset;
+	union {
+		uint32_t share: 1;
+		/*
+		 * share will be set to 1 when this counter is used as indirect
+		 * action. Only meaningful when user own this counter.
+		 */
+		uint32_t query_gen_when_free;
+		/*
+		 * When PMD own this counter (user put back counter to PMD
+		 * counter pool, i.e), this field recorded value of counter
+		 * pools query generation at time user release the counter.
+		 */
+	};
+};
+
+struct mlx5_hws_cnt_raw_data_mng {
+	struct flow_counter_stats *raw;
+	struct mlx5_pmd_mr mr;
+};
+
+struct mlx5_hws_cache_param {
+	uint32_t size;
+	uint32_t q_num;
+	uint32_t fetch_sz;
+	uint32_t threshold;
+	uint32_t preload_sz;
+};
+
+struct mlx5_hws_cnt_pool_cfg {
+	char *name;
+	uint32_t request_num;
+	uint32_t alloc_factor;
+};
+
+struct mlx5_hws_cnt_pool_caches {
+	uint32_t fetch_sz;
+	uint32_t threshold;
+	uint32_t preload_sz;
+	uint32_t q_num;
+	struct rte_ring *qcache[];
+};
+
+struct mlx5_hws_cnt_pool {
+	struct mlx5_hws_cnt_pool_cfg cfg __rte_cache_aligned;
+	struct mlx5_hws_cnt_dcs_mng dcs_mng __rte_cache_aligned;
+	uint32_t query_gen __rte_cache_aligned;
+	struct mlx5_hws_cnt *pool;
+	struct mlx5_hws_cnt_raw_data_mng *raw_mng;
+	struct rte_ring *reuse_list;
+	struct rte_ring *free_list;
+	struct rte_ring *wait_reset_list;
+	struct mlx5_hws_cnt_pool_caches *cache;
+} __rte_cache_aligned;
+
+/**
+ * Translate counter id into internal index (start from 0), which can be used
+ * as index of raw/cnt pool.
+ *
+ * @param cnt_id
+ *   The external counter id
+ * @return
+ *   Internal index
+ */
+static __rte_always_inline cnt_id_t
+mlx5_hws_cnt_iidx(struct mlx5_hws_cnt_pool *cpool, cnt_id_t cnt_id)
+{
+	uint8_t dcs_idx = cnt_id >> MLX5_HWS_CNT_DCS_IDX_OFFSET;
+	uint32_t offset = cnt_id & MLX5_HWS_CNT_IDX_MASK;
+
+	dcs_idx &= MLX5_HWS_CNT_DCS_IDX_MASK;
+	return (cpool->dcs_mng.dcs[dcs_idx].iidx + offset);
+}
+
+/**
+ * Check if it's valid counter id.
+ */
+static __rte_always_inline bool
+mlx5_hws_cnt_id_valid(cnt_id_t cnt_id)
+{
+	return (cnt_id >> MLX5_INDIRECT_ACTION_TYPE_OFFSET) ==
+		MLX5_INDIRECT_ACTION_TYPE_COUNT ? true : false;
+}
+
+/**
+ * Generate Counter id from internal index.
+ *
+ * @param cpool
+ *   The pointer to counter pool
+ * @param index
+ *   The internal counter index.
+ *
+ * @return
+ *   Counter id
+ */
+static __rte_always_inline cnt_id_t
+mlx5_hws_cnt_id_gen(struct mlx5_hws_cnt_pool *cpool, cnt_id_t iidx)
+{
+	struct mlx5_hws_cnt_dcs_mng *dcs_mng = &cpool->dcs_mng;
+	uint32_t idx;
+	uint32_t offset;
+	cnt_id_t cnt_id;
+
+	for (idx = 0, offset = iidx; idx < dcs_mng->batch_total; idx++) {
+		if (dcs_mng->dcs[idx].batch_sz <= offset)
+			offset -= dcs_mng->dcs[idx].batch_sz;
+		else
+			break;
+	}
+	cnt_id = offset;
+	cnt_id |= (idx << MLX5_HWS_CNT_DCS_IDX_OFFSET);
+	return (MLX5_INDIRECT_ACTION_TYPE_COUNT <<
+			MLX5_INDIRECT_ACTION_TYPE_OFFSET) | cnt_id;
+}
+
+static __rte_always_inline void
+__hws_cnt_query_raw(struct mlx5_hws_cnt_pool *cpool, cnt_id_t cnt_id,
+		uint64_t *raw_pkts, uint64_t *raw_bytes)
+{
+	struct mlx5_hws_cnt_raw_data_mng *raw_mng = cpool->raw_mng;
+	struct flow_counter_stats s[2];
+	uint8_t i = 0x1;
+	size_t stat_sz = sizeof(s[0]);
+	uint32_t iidx = mlx5_hws_cnt_iidx(cpool, cnt_id);
+
+	memcpy(&s[0], &raw_mng->raw[iidx], stat_sz);
+	do {
+		memcpy(&s[i & 1], &raw_mng->raw[iidx], stat_sz);
+		if (memcmp(&s[0], &s[1], stat_sz) == 0) {
+			*raw_pkts = rte_be_to_cpu_64(s[0].hits);
+			*raw_bytes = rte_be_to_cpu_64(s[0].bytes);
+			break;
+		}
+		i = ~i;
+	} while (1);
+}
+
+/**
+ * Copy elems from one zero-copy ring to zero-copy ring in place.
+ *
+ * The input is a rte ring zero-copy data struct, which has two pointer.
+ * in case of the wrapper happened, the ptr2 will be meaningful.
+ *
+ * So this rountin needs to consider the situation that the address given by
+ * source and destination could be both wrapped.
+ * First, calculate the first number of element needs to be copied until wrapped
+ * address, which could be in source or destination.
+ * Second, copy left number of element until second wrapped address. If in first
+ * step the wrapped address is source, then this time it must be in destination.
+ * and vice-vers.
+ * Third, copy all left numbe of element.
+ *
+ * In worst case, we need copy three pieces of continuous memory.
+ *
+ * @param zcdd
+ *   A pointer to zero-copy data of dest ring.
+ * @param zcds
+ *   A pointer to zero-copy data of source ring.
+ * @param n
+ *   Number of elems to copy.
+ */
+static __rte_always_inline void
+__hws_cnt_r2rcpy(struct rte_ring_zc_data *zcdd, struct rte_ring_zc_data *zcds,
+		unsigned int n)
+{
+	unsigned int n1, n2, n3;
+	void *s1, *s2, *s3;
+	void *d1, *d2, *d3;
+
+	s1 = zcds->ptr1;
+	d1 = zcdd->ptr1;
+	n1 = RTE_MIN(zcdd->n1, zcds->n1);
+	if (zcds->n1 > n1) {
+		n2 = zcds->n1 - n1;
+		s2 = RTE_PTR_ADD(zcds->ptr1, sizeof(cnt_id_t) * n1);
+		d2 = zcdd->ptr2;
+		n3 = n - n1 - n2;
+		s3 = zcds->ptr2;
+		d3 = RTE_PTR_ADD(zcdd->ptr2, sizeof(cnt_id_t) * n2);
+	} else {
+		n2 = zcdd->n1 - n1;
+		s2 = zcds->ptr2;
+		d2 = RTE_PTR_ADD(zcdd->ptr1, sizeof(cnt_id_t) * n1);
+		n3 = n - n1 - n2;
+		s3 = RTE_PTR_ADD(zcds->ptr2, sizeof(cnt_id_t) * n2);
+		d3 = zcdd->ptr2;
+	}
+	memcpy(d1, s1, n1 * sizeof(cnt_id_t));
+	if (n2 != 0) {
+		memcpy(d2, s2, n2 * sizeof(cnt_id_t));
+		if (n3 != 0)
+			memcpy(d3, s3, n3 * sizeof(cnt_id_t));
+	}
+}
+
+static __rte_always_inline int
+mlx5_hws_cnt_pool_cache_flush(struct mlx5_hws_cnt_pool *cpool,
+			      uint32_t queue_id)
+{
+	unsigned int ret;
+	struct rte_ring_zc_data zcdr = {0};
+	struct rte_ring_zc_data zcdc = {0};
+	struct rte_ring *reset_list = NULL;
+	struct rte_ring *qcache = cpool->cache->qcache[queue_id];
+
+	ret = rte_ring_dequeue_zc_burst_elem_start(qcache,
+			sizeof(cnt_id_t), rte_ring_count(qcache), &zcdc,
+			NULL);
+	MLX5_ASSERT(ret);
+	reset_list = cpool->wait_reset_list;
+	rte_ring_enqueue_zc_burst_elem_start(reset_list,
+			sizeof(cnt_id_t), ret, &zcdr, NULL);
+	__hws_cnt_r2rcpy(&zcdr, &zcdc, ret);
+	rte_ring_enqueue_zc_elem_finish(reset_list, ret);
+	rte_ring_dequeue_zc_elem_finish(qcache, ret);
+	return 0;
+}
+
+static __rte_always_inline int
+mlx5_hws_cnt_pool_cache_fetch(struct mlx5_hws_cnt_pool *cpool,
+			      uint32_t queue_id)
+{
+	struct rte_ring *qcache = cpool->cache->qcache[queue_id];
+	struct rte_ring *free_list = NULL;
+	struct rte_ring *reuse_list = NULL;
+	struct rte_ring *list = NULL;
+	struct rte_ring_zc_data zcdf = {0};
+	struct rte_ring_zc_data zcdc = {0};
+	struct rte_ring_zc_data zcdu = {0};
+	struct rte_ring_zc_data zcds = {0};
+	struct mlx5_hws_cnt_pool_caches *cache = cpool->cache;
+	unsigned int ret;
+
+	reuse_list = cpool->reuse_list;
+	ret = rte_ring_dequeue_zc_burst_elem_start(reuse_list,
+			sizeof(cnt_id_t), cache->fetch_sz, &zcdu, NULL);
+	zcds = zcdu;
+	list = reuse_list;
+	if (unlikely(ret == 0)) { /* no reuse counter. */
+		rte_ring_dequeue_zc_elem_finish(reuse_list, 0);
+		free_list = cpool->free_list;
+		ret = rte_ring_dequeue_zc_burst_elem_start(free_list,
+				sizeof(cnt_id_t), cache->fetch_sz, &zcdf, NULL);
+		zcds = zcdf;
+		list = free_list;
+		if (unlikely(ret == 0)) { /* no free counter. */
+			rte_ring_dequeue_zc_elem_finish(free_list, 0);
+			if (rte_ring_count(cpool->wait_reset_list))
+				return -EAGAIN;
+			return -ENOENT;
+		}
+	}
+	rte_ring_enqueue_zc_burst_elem_start(qcache, sizeof(cnt_id_t),
+			ret, &zcdc, NULL);
+	__hws_cnt_r2rcpy(&zcdc, &zcds, ret);
+	rte_ring_dequeue_zc_elem_finish(list, ret);
+	rte_ring_enqueue_zc_elem_finish(qcache, ret);
+	return 0;
+}
+
+static __rte_always_inline int
+__mlx5_hws_cnt_pool_enqueue_revert(struct rte_ring *r, unsigned int n,
+		struct rte_ring_zc_data *zcd)
+{
+	uint32_t current_head = 0;
+	uint32_t revert2head = 0;
+
+	MLX5_ASSERT(r->prod.sync_type == RTE_RING_SYNC_ST);
+	MLX5_ASSERT(r->cons.sync_type == RTE_RING_SYNC_ST);
+	current_head = __atomic_load_n(&r->prod.head, __ATOMIC_RELAXED);
+	MLX5_ASSERT(n <= r->capacity);
+	MLX5_ASSERT(n <= rte_ring_count(r));
+	revert2head = current_head - n;
+	r->prod.head = revert2head; /* This ring should be SP. */
+	__rte_ring_get_elem_addr(r, revert2head, sizeof(cnt_id_t), n,
+			&zcd->ptr1, &zcd->n1, &zcd->ptr2);
+	/* Update tail */
+	__atomic_store_n(&r->prod.tail, revert2head, __ATOMIC_RELEASE);
+	return n;
+}
+
+/**
+ * Put one counter back in the mempool.
+ *
+ * @param cpool
+ *   A pointer to the counter pool structure.
+ * @param cnt_id
+ *   A counter id to be added.
+ * @return
+ *   - 0: Success; object taken
+ *   - -ENOENT: not enough entry in pool
+ */
+static __rte_always_inline int
+mlx5_hws_cnt_pool_put(struct mlx5_hws_cnt_pool *cpool,
+		uint32_t *queue, cnt_id_t *cnt_id)
+{
+	unsigned int ret = 0;
+	struct rte_ring_zc_data zcdc = {0};
+	struct rte_ring_zc_data zcdr = {0};
+	struct rte_ring *qcache = NULL;
+	unsigned int wb_num = 0; /* cache write-back number. */
+	cnt_id_t iidx;
+
+	iidx = mlx5_hws_cnt_iidx(cpool, *cnt_id);
+	cpool->pool[iidx].query_gen_when_free =
+		__atomic_load_n(&cpool->query_gen, __ATOMIC_RELAXED);
+	if (likely(queue != NULL))
+		qcache = cpool->cache->qcache[*queue];
+	if (unlikely(qcache == NULL)) {
+		ret = rte_ring_enqueue_elem(cpool->wait_reset_list, cnt_id,
+				sizeof(cnt_id_t));
+		MLX5_ASSERT(ret == 0);
+		return ret;
+	}
+	ret = rte_ring_enqueue_burst_elem(qcache, cnt_id, sizeof(cnt_id_t), 1,
+					  NULL);
+	if (unlikely(ret == 0)) { /* cache is full. */
+		wb_num = rte_ring_count(qcache) - cpool->cache->threshold;
+		MLX5_ASSERT(wb_num < rte_ring_count(qcache));
+		__mlx5_hws_cnt_pool_enqueue_revert(qcache, wb_num, &zcdc);
+		rte_ring_enqueue_zc_burst_elem_start(cpool->wait_reset_list,
+				sizeof(cnt_id_t), wb_num, &zcdr, NULL);
+		__hws_cnt_r2rcpy(&zcdr, &zcdc, wb_num);
+		rte_ring_enqueue_zc_elem_finish(cpool->wait_reset_list, wb_num);
+		/* write-back THIS counter too */
+		ret = rte_ring_enqueue_burst_elem(cpool->wait_reset_list,
+				cnt_id, sizeof(cnt_id_t), 1, NULL);
+	}
+	return ret == 1 ? 0 : -ENOENT;
+}
+
+/**
+ * Get one counter from the pool.
+ *
+ * If @param queue is not null, objects will be retrieved first from queue's
+ * cache, subsequently from the common pool. Note that it can return -ENOENT
+ * when the local cache and common pool are empty, even if cache from other
+ * queue are full.
+ *
+ * @param cntp
+ *   A pointer to the counter pool structure.
+ * @param queue
+ *   A pointer to HWS queue. If null, it means fetch from common pool.
+ * @param cnt_id
+ *   A pointer to a cnt_id_t * pointer (counter id) that will be filled.
+ * @return
+ *   - 0: Success; objects taken.
+ *   - -ENOENT: Not enough entries in the mempool; no object is retrieved.
+ *   - -EAGAIN: counter is not ready; try again.
+ */
+static __rte_always_inline int
+mlx5_hws_cnt_pool_get(struct mlx5_hws_cnt_pool *cpool,
+		uint32_t *queue, cnt_id_t *cnt_id)
+{
+	unsigned int ret;
+	struct rte_ring_zc_data zcdc = {0};
+	struct rte_ring *qcache = NULL;
+	uint32_t query_gen = 0;
+	cnt_id_t iidx, tmp_cid = 0;
+
+	if (likely(queue != NULL))
+		qcache = cpool->cache->qcache[*queue];
+	if (unlikely(qcache == NULL)) {
+		ret = rte_ring_dequeue_elem(cpool->reuse_list, &tmp_cid,
+				sizeof(cnt_id_t));
+		if (unlikely(ret != 0)) {
+			ret = rte_ring_dequeue_elem(cpool->free_list, &tmp_cid,
+					sizeof(cnt_id_t));
+			if (unlikely(ret != 0)) {
+				if (rte_ring_count(cpool->wait_reset_list))
+					return -EAGAIN;
+				return -ENOENT;
+			}
+		}
+		*cnt_id = tmp_cid;
+		iidx = mlx5_hws_cnt_iidx(cpool, *cnt_id);
+		__hws_cnt_query_raw(cpool, *cnt_id,
+				    &cpool->pool[iidx].reset.hits,
+				    &cpool->pool[iidx].reset.bytes);
+		return 0;
+	}
+	ret = rte_ring_dequeue_zc_burst_elem_start(qcache, sizeof(cnt_id_t), 1,
+			&zcdc, NULL);
+	if (unlikely(ret == 0)) { /* local cache is empty. */
+		rte_ring_dequeue_zc_elem_finish(qcache, 0);
+		/* let's fetch from global free list. */
+		ret = mlx5_hws_cnt_pool_cache_fetch(cpool, *queue);
+		if (unlikely(ret != 0))
+			return ret;
+		rte_ring_dequeue_zc_burst_elem_start(qcache, sizeof(cnt_id_t),
+				1, &zcdc, NULL);
+	}
+	/* get one from local cache. */
+	*cnt_id = (*(cnt_id_t *)zcdc.ptr1);
+	iidx = mlx5_hws_cnt_iidx(cpool, *cnt_id);
+	query_gen = cpool->pool[iidx].query_gen_when_free;
+	if (cpool->query_gen == query_gen) { /* counter is waiting to reset. */
+		rte_ring_dequeue_zc_elem_finish(qcache, 0);
+		/* write-back counter to reset list. */
+		mlx5_hws_cnt_pool_cache_flush(cpool, *queue);
+		/* let's fetch from global free list. */
+		ret = mlx5_hws_cnt_pool_cache_fetch(cpool, *queue);
+		if (unlikely(ret != 0))
+			return ret;
+		rte_ring_dequeue_zc_burst_elem_start(qcache, sizeof(cnt_id_t),
+				1, &zcdc, NULL);
+		*cnt_id = *(cnt_id_t *)zcdc.ptr1;
+	}
+	__hws_cnt_query_raw(cpool, *cnt_id, &cpool->pool[iidx].reset.hits,
+			    &cpool->pool[iidx].reset.bytes);
+	rte_ring_dequeue_zc_elem_finish(qcache, 1);
+	cpool->pool[iidx].share = 0;
+	return 0;
+}
+
+static __rte_always_inline unsigned int
+mlx5_hws_cnt_pool_get_size(struct mlx5_hws_cnt_pool *cpool)
+{
+	return rte_ring_get_capacity(cpool->free_list);
+}
+
+static __rte_always_inline int
+mlx5_hws_cnt_pool_get_action_offset(struct mlx5_hws_cnt_pool *cpool,
+		cnt_id_t cnt_id, struct mlx5dr_action **action,
+		uint32_t *offset)
+{
+	uint8_t idx = cnt_id >> MLX5_HWS_CNT_DCS_IDX_OFFSET;
+
+	idx &= MLX5_HWS_CNT_DCS_IDX_MASK;
+	*action = cpool->dcs_mng.dcs[idx].dr_action;
+	*offset = cnt_id & MLX5_HWS_CNT_IDX_MASK;
+	return 0;
+}
+
+static __rte_always_inline int
+mlx5_hws_cnt_shared_get(struct mlx5_hws_cnt_pool *cpool, cnt_id_t *cnt_id)
+{
+	int ret;
+	uint32_t iidx;
+
+	ret = mlx5_hws_cnt_pool_get(cpool, NULL, cnt_id);
+	if (ret != 0)
+		return ret;
+	iidx = mlx5_hws_cnt_iidx(cpool, *cnt_id);
+	MLX5_ASSERT(cpool->pool[iidx].share == 0);
+	cpool->pool[iidx].share = 1;
+	return 0;
+}
+
+static __rte_always_inline int
+mlx5_hws_cnt_shared_put(struct mlx5_hws_cnt_pool *cpool, cnt_id_t *cnt_id)
+{
+	int ret;
+	uint32_t iidx = mlx5_hws_cnt_iidx(cpool, *cnt_id);
+
+	cpool->pool[iidx].share = 0;
+	ret = mlx5_hws_cnt_pool_put(cpool, NULL, cnt_id);
+	if (unlikely(ret != 0))
+		cpool->pool[iidx].share = 1; /* fail to release, restore. */
+	return ret;
+}
+
+static __rte_always_inline bool
+mlx5_hws_cnt_is_shared(struct mlx5_hws_cnt_pool *cpool, cnt_id_t cnt_id)
+{
+	uint32_t iidx = mlx5_hws_cnt_iidx(cpool, cnt_id);
+
+	return cpool->pool[iidx].share ? true : false;
+}
+
+/* init HWS counter pool. */
+struct mlx5_hws_cnt_pool *
+mlx5_hws_cnt_pool_init(const struct mlx5_hws_cnt_pool_cfg *pcfg,
+		const struct mlx5_hws_cache_param *ccfg);
+
+void
+mlx5_hws_cnt_pool_deinit(struct mlx5_hws_cnt_pool *cntp);
+
+int
+mlx5_hws_cnt_service_thread_create(struct mlx5_dev_ctx_shared *sh);
+
+void
+mlx5_hws_cnt_service_thread_destroy(struct mlx5_dev_ctx_shared *sh);
+
+int
+mlx5_hws_cnt_pool_dcs_alloc(struct mlx5_dev_ctx_shared *sh,
+		struct mlx5_hws_cnt_pool *cpool);
+void
+mlx5_hws_cnt_pool_dcs_free(struct mlx5_dev_ctx_shared *sh,
+		struct mlx5_hws_cnt_pool *cpool);
+
+int
+mlx5_hws_cnt_pool_action_create(struct mlx5_priv *priv,
+		struct mlx5_hws_cnt_pool *cpool);
+
+void
+mlx5_hws_cnt_pool_action_destroy(struct mlx5_hws_cnt_pool *cpool);
+
+struct mlx5_hws_cnt_pool *
+mlx5_hws_cnt_pool_create(struct rte_eth_dev *dev,
+		const struct rte_flow_port_attr *pattr, uint16_t nb_queue);
+
+void
+mlx5_hws_cnt_pool_destroy(struct mlx5_dev_ctx_shared *sh,
+		struct mlx5_hws_cnt_pool *cpool);
+
+int
+mlx5_hws_cnt_svc_init(struct mlx5_dev_ctx_shared *sh);
+
+void
+mlx5_hws_cnt_svc_deinit(struct mlx5_dev_ctx_shared *sh);
+
+#endif /* _MLX5_HWS_CNT_H_ */