[1/4] net/mlx5: accelerate DV flow counter transactions
diff mbox series

Message ID 1562594861-27123-2-git-send-email-matan@mellanox.com
State Superseded, archived
Delegated to: Raslan Darawsheh
Headers show
Series
  • net/mlx5: accelerate DV flow counters mangement
Related show

Checks

Context Check Description
ci/checkpatch warning coding style issues
ci/mellanox-Performance-Testing success Performance Testing PASS
ci/intel-Performance-Testing success Performance Testing PASS
ci/Intel-compilation fail Compilation issues

Commit Message

Matan Azrad July 8, 2019, 2:07 p.m. UTC
The DevX interface exposes a new feature to the PMD that can allocate a
batch of counters by one FW command. It can improve the flow
transaction rate (with count action).

Add a new counter pools mechanism to manage HW counters in the PMD.
So, for each flow with counter creation the PMD will try to find a free
counter in the PMD pools container and only if there is no a free
counter, it will allocate a new DevX batch counters.

Currently we cannot support batch counter for a group 0 flow, so
create a 2 container types, one which allocates counters one by
one and one which allocates X counters by the batch feature.

The allocated counters objects are never released back to the HW
assuming the flows maximum number will be close to the actual value of
the flows number.
Later, it can be updated, and dynamic release mechanism can be added.

The counters are contained in pools, each pool with 512 counters.
The pools are contained in counter containers according to the
allocation resolution type - single or batch.
The cache memory of the counters statistics is saved as raw data per
pool.
All the raw data memory is allocated for all the container in one
memory allocation and is managed by counter_stats_mem_mng structure
which registers all the raw memory to the HW.
Each pool points to one raw data structure.

The query operation is in pool resolution which updates all the pool
counter raw data by one operation.

Signed-off-by: Matan Azrad <matan@mellanox.com>
Acked-by: Shahaf Shuler <shahafs@mellanox.com>
---
 drivers/net/mlx5/Makefile          |   2 +-
 drivers/net/mlx5/mlx5.c            |  85 +++++++
 drivers/net/mlx5/mlx5.h            | 115 ++++++++-
 drivers/net/mlx5/mlx5_devx_cmds.c  | 185 ++++++++++----
 drivers/net/mlx5/mlx5_flow.h       |  19 --
 drivers/net/mlx5/mlx5_flow_dv.c    | 485 ++++++++++++++++++++++++++++++++-----
 drivers/net/mlx5/mlx5_flow_verbs.c |  15 +-
 drivers/net/mlx5/mlx5_glue.c       |  29 +++
 drivers/net/mlx5/mlx5_glue.h       |   5 +
 drivers/net/mlx5/mlx5_prm.h        | 112 ++++++++-
 10 files changed, 901 insertions(+), 151 deletions(-)

Patch
diff mbox series

diff --git a/drivers/net/mlx5/Makefile b/drivers/net/mlx5/Makefile
index 619e6b6..b210c80 100644
--- a/drivers/net/mlx5/Makefile
+++ b/drivers/net/mlx5/Makefile
@@ -8,7 +8,7 @@  include $(RTE_SDK)/mk/rte.vars.mk
 LIB = librte_pmd_mlx5.a
 LIB_GLUE = $(LIB_GLUE_BASE).$(LIB_GLUE_VERSION)
 LIB_GLUE_BASE = librte_pmd_mlx5_glue.so
-LIB_GLUE_VERSION = 19.05.0
+LIB_GLUE_VERSION = 19.08.0
 
 # Sources.
 SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5.c
diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index d93f92d..62be141 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -157,6 +157,89 @@  struct mlx5_dev_spawn_data {
 static pthread_mutex_t mlx5_ibv_list_mutex = PTHREAD_MUTEX_INITIALIZER;
 
 /**
+ * Initialize the counters management structure.
+ *
+ * @param[in] sh
+ *   Pointer to mlx5_ibv_shared object to free
+ */
+static void
+mlx5_flow_counters_mng_init(struct mlx5_ibv_shared *sh)
+{
+	uint8_t i;
+
+	TAILQ_INIT(&sh->cmng.flow_counters);
+	for (i = 0; i < RTE_DIM(sh->cmng.ccont); ++i)
+		TAILQ_INIT(&sh->cmng.ccont[i].pool_list);
+}
+
+/**
+ * Destroy all the resources allocated for a counter memory management.
+ *
+ * @param[in] mng
+ *   Pointer to the memory management structure.
+ */
+static void
+mlx5_flow_destroy_counter_stat_mem_mng(struct mlx5_counter_stats_mem_mng *mng)
+{
+	uint8_t *mem = (uint8_t *)(uintptr_t)mng->raws[0].data;
+
+	LIST_REMOVE(mng, next);
+	claim_zero(mlx5_devx_cmd_destroy(mng->dm));
+	claim_zero(mlx5_glue->devx_umem_dereg(mng->umem));
+	rte_free(mem);
+}
+
+/**
+ * Close and release all the resources of the counters management.
+ *
+ * @param[in] sh
+ *   Pointer to mlx5_ibv_shared object to free.
+ */
+static void
+mlx5_flow_counters_mng_close(struct mlx5_ibv_shared *sh)
+{
+	struct mlx5_counter_stats_mem_mng *mng;
+	uint8_t i;
+	int j;
+
+	for (i = 0; i < RTE_DIM(sh->cmng.ccont); ++i) {
+		struct mlx5_flow_counter_pool *pool;
+		uint32_t batch = !!(i % 2);
+
+		if (!sh->cmng.ccont[i].pools)
+			continue;
+		pool = TAILQ_FIRST(&sh->cmng.ccont[i].pool_list);
+		while (pool) {
+			if (batch) {
+				if (pool->min_dcs)
+					claim_zero
+					(mlx5_devx_cmd_destroy(pool->min_dcs));
+			}
+			for (j = 0; j < MLX5_COUNTERS_PER_POOL; ++j) {
+				if (pool->counters_raw[j].action)
+					claim_zero
+					(mlx5_glue->destroy_flow_action
+					       (pool->counters_raw[j].action));
+				if (!batch && pool->counters_raw[j].dcs)
+					claim_zero(mlx5_devx_cmd_destroy
+						  (pool->counters_raw[j].dcs));
+			}
+			TAILQ_REMOVE(&sh->cmng.ccont[i].pool_list, pool,
+				     next);
+			rte_free(pool);
+			pool = TAILQ_FIRST(&sh->cmng.ccont[i].pool_list);
+		}
+		rte_free(sh->cmng.ccont[i].pools);
+	}
+	mng = LIST_FIRST(&sh->cmng.mem_mngs);
+	while (mng) {
+		mlx5_flow_destroy_counter_stat_mem_mng(mng);
+		mng = LIST_FIRST(&sh->cmng.mem_mngs);
+	}
+	memset(&sh->cmng, 0, sizeof(sh->cmng));
+}
+
+/**
  * Allocate shared IB device context. If there is multiport device the
  * master and representors will share this context, if there is single
  * port dedicated IB device, the context will be used by only given
@@ -260,6 +343,7 @@  struct mlx5_dev_spawn_data {
 		err = rte_errno;
 		goto error;
 	}
+	mlx5_flow_counters_mng_init(sh);
 	LIST_INSERT_HEAD(&mlx5_ibv_list, sh, next);
 exit:
 	pthread_mutex_unlock(&mlx5_ibv_list_mutex);
@@ -314,6 +398,7 @@  struct mlx5_dev_spawn_data {
 	 *  Ensure there is no async event handler installed.
 	 *  Only primary process handles async device events.
 	 **/
+	mlx5_flow_counters_mng_close(sh);
 	assert(!sh->intr_cnt);
 	if (sh->intr_cnt)
 		mlx5_intr_callback_unregister
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 5af3f41..3944b5f 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -152,15 +152,23 @@  struct mlx5_stats_ctrl {
 	uint64_t imissed_base;
 };
 
-/* devx counter object */
-struct mlx5_devx_counter_set {
-	struct mlx5dv_devx_obj *obj;
-	int id; /* Flow counter ID */
+/* devX creation object */
+struct mlx5_devx_obj {
+	struct mlx5dv_devx_obj *obj; /* The DV object. */
+	int id; /* The object ID. */
+};
+
+struct mlx5_devx_mkey_attr {
+	uint64_t addr;
+	uint64_t size;
+	uint32_t umem_id;
+	uint32_t pd;
 };
 
 /* HCA attributes. */
 struct mlx5_hca_attr {
 	uint32_t eswitch_manager:1;
+	uint8_t flow_counter_bulk_alloc_bitmap;
 };
 
 /* Flow list . */
@@ -248,6 +256,87 @@  struct mlx5_drop {
 	struct mlx5_rxq_ibv *rxq; /* Verbs Rx queue. */
 };
 
+#define MLX5_COUNTERS_PER_POOL 512
+
+struct mlx5_flow_counter_pool;
+
+struct flow_counter_stats {
+	uint64_t hits;
+	uint64_t bytes;
+};
+
+/* Counters information. */
+struct mlx5_flow_counter {
+	TAILQ_ENTRY(mlx5_flow_counter) next;
+	/**< Pointer to the next flow counter structure. */
+	uint32_t shared:1; /**< Share counter ID with other flow rules. */
+	uint32_t batch: 1;
+	/**< Whether the counter was allocated by batch command. */
+	uint32_t ref_cnt:30; /**< Reference counter. */
+	uint32_t id; /**< Counter ID. */
+	union {  /**< Holds the counters for the rule. */
+#if defined(HAVE_IBV_DEVICE_COUNTERS_SET_V42)
+		struct ibv_counter_set *cs;
+#elif defined(HAVE_IBV_DEVICE_COUNTERS_SET_V45)
+		struct ibv_counters *cs;
+#endif
+		struct mlx5_devx_obj *dcs; /**< Counter Devx object. */
+		struct mlx5_flow_counter_pool *pool; /**< The counter pool. */
+	};
+	uint64_t hits; /**< Reset value of hits packets. */
+	uint64_t bytes; /**< Reset value of bytes. */
+	void *action; /**< Pointer to the dv action. */
+};
+
+TAILQ_HEAD(mlx5_counters, mlx5_flow_counter);
+
+/* Counter pool structure - query is in pool resolution. */
+struct mlx5_flow_counter_pool {
+	TAILQ_ENTRY(mlx5_flow_counter_pool) next;
+	struct mlx5_counters counters; /* Free counter list. */
+	struct mlx5_devx_obj *min_dcs;
+	/* The devx object of the minimum counter ID in the pool. */
+	struct mlx5_counter_stats_raw *raw; /* The counter stats memory raw. */
+	struct mlx5_flow_counter counters_raw[]; /* The counters memory. */
+};
+
+struct mlx5_counter_stats_raw;
+
+/* Memory management structure for group of counter statistics raws. */
+struct mlx5_counter_stats_mem_mng {
+	LIST_ENTRY(mlx5_counter_stats_mem_mng) next;
+	struct mlx5_counter_stats_raw *raws;
+	struct mlx5_devx_obj *dm;
+	struct mlx5dv_devx_umem *umem;
+};
+
+/* Raw memory structure for the counter statistics values of a pool. */
+struct mlx5_counter_stats_raw {
+	LIST_ENTRY(mlx5_counter_stats_raw) next;
+	int min_dcs_id;
+	struct mlx5_counter_stats_mem_mng *mem_mng;
+	volatile struct flow_counter_stats *data;
+};
+
+TAILQ_HEAD(mlx5_counter_pools, mlx5_flow_counter_pool);
+
+/* Container structure for counter pools. */
+struct mlx5_pools_container {
+	uint16_t n_valid; /* Number of valid pools. */
+	uint16_t n; /* Number of pools. */
+	struct mlx5_counter_pools pool_list; /* Counter pool list. */
+	struct mlx5_flow_counter_pool **pools; /* Counter pool array. */
+	struct mlx5_counter_stats_mem_mng *init_mem_mng;
+	/* Hold the memory management for the next allocated pools raws. */
+};
+
+/* Counter global management structure. */
+struct mlx5_flow_counter_mng {
+	struct mlx5_pools_container ccont[2];
+	struct mlx5_counters flow_counters; /* Legacy flow counter list. */
+	LIST_HEAD(mem_mngs, mlx5_counter_stats_mem_mng) mem_mngs;
+};
+
 /* Per port data of shared IB device. */
 struct mlx5_ibv_shared_port {
 	uint32_t ih_port_id;
@@ -314,6 +403,7 @@  struct mlx5_ibv_shared {
 	LIST_HEAD(jump, mlx5_flow_dv_jump_tbl_resource) jump_tbl;
 	LIST_HEAD(port_id_action_list, mlx5_flow_dv_port_id_action_resource)
 		port_id_action_list; /* List of port ID actions. */
+	struct mlx5_flow_counter_mng cmng; /* Counters management structure. */
 	/* Shared interrupt handler section. */
 	pthread_mutex_t intr_mutex; /* Interrupt config mutex. */
 	uint32_t intr_cnt; /* Interrupt handler reference counter. */
@@ -362,8 +452,6 @@  struct mlx5_priv {
 	struct mlx5_drop drop_queue; /* Flow drop queues. */
 	struct mlx5_flows flows; /* RTE Flow rules. */
 	struct mlx5_flows ctrl_flows; /* Control flow rules. */
-	LIST_HEAD(counters, mlx5_flow_counter) flow_counters;
-	/* Flow counters. */
 	LIST_HEAD(rxq, mlx5_rxq_ctrl) rxqsctrl; /* DPDK Rx queues. */
 	LIST_HEAD(rxqibv, mlx5_rxq_ibv) rxqsibv; /* Verbs Rx queues. */
 	LIST_HEAD(hrxq, mlx5_hrxq) hrxqs; /* Verbs Hash Rx queues. */
@@ -584,12 +672,15 @@  int mlx5_nl_switch_info(int nl, unsigned int ifindex,
 
 /* mlx5_devx_cmds.c */
 
-int mlx5_devx_cmd_flow_counter_alloc(struct ibv_context *ctx,
-				     struct mlx5_devx_counter_set *dcx);
-int mlx5_devx_cmd_flow_counter_free(struct mlx5dv_devx_obj *obj);
-int mlx5_devx_cmd_flow_counter_query(struct mlx5_devx_counter_set *dcx,
-				     int clear,
-				     uint64_t *pkts, uint64_t *bytes);
+struct mlx5_devx_obj *mlx5_devx_cmd_flow_counter_alloc(struct ibv_context *ctx,
+						       uint32_t bulk_sz);
+int mlx5_devx_cmd_destroy(struct mlx5_devx_obj *obj);
+int mlx5_devx_cmd_flow_counter_query(struct mlx5_devx_obj *dcs,
+				     int clear, uint32_t n_counters,
+				     uint64_t *pkts, uint64_t *bytes,
+				     uint32_t mkey, void *addr);
 int mlx5_devx_cmd_query_hca_attr(struct ibv_context *ctx,
 				 struct mlx5_hca_attr *attr);
+struct mlx5_devx_obj *mlx5_devx_cmd_mkey_create(struct ibv_context *ctx,
+					     struct mlx5_devx_mkey_attr *attr);
 #endif /* RTE_PMD_MLX5_H_ */
diff --git a/drivers/net/mlx5/mlx5_devx_cmds.c b/drivers/net/mlx5/mlx5_devx_cmds.c
index e5776c4..92f2fc8 100644
--- a/drivers/net/mlx5/mlx5_devx_cmds.c
+++ b/drivers/net/mlx5/mlx5_devx_cmds.c
@@ -2,6 +2,8 @@ 
 /* Copyright 2018 Mellanox Technologies, Ltd */
 
 #include <rte_flow_driver.h>
+#include <rte_malloc.h>
+#include <unistd.h>
 
 #include "mlx5.h"
 #include "mlx5_glue.h"
@@ -14,47 +16,37 @@ 
  *   ibv contexts returned from mlx5dv_open_device.
  * @param dcs
  *   Pointer to counters properties structure to be filled by the routine.
+ * @param bulk_n_128
+ *   Bulk counter numbers in 128 counters units.
  *
  * @return
- *   0 on success, a negative value otherwise.
+ *   Pointer to counter object on success, a negative value otherwise and
+ *   rte_errno is set.
  */
-int mlx5_devx_cmd_flow_counter_alloc(struct ibv_context *ctx,
-				     struct mlx5_devx_counter_set *dcs)
+struct mlx5_devx_obj *
+mlx5_devx_cmd_flow_counter_alloc(struct ibv_context *ctx, uint32_t bulk_n_128)
 {
+	struct mlx5_devx_obj *dcs = rte_zmalloc("dcs", sizeof(*dcs), 0);
 	uint32_t in[MLX5_ST_SZ_DW(alloc_flow_counter_in)]   = {0};
 	uint32_t out[MLX5_ST_SZ_DW(alloc_flow_counter_out)] = {0};
-	int status, syndrome;
 
+	if (!dcs) {
+		rte_errno = ENOMEM;
+		return NULL;
+	}
 	MLX5_SET(alloc_flow_counter_in, in, opcode,
 		 MLX5_CMD_OP_ALLOC_FLOW_COUNTER);
+	MLX5_SET(alloc_flow_counter_in, in, flow_counter_bulk, bulk_n_128);
 	dcs->obj = mlx5_glue->devx_obj_create(ctx, in,
 					      sizeof(in), out, sizeof(out));
-	if (!dcs->obj)
-		return -errno;
-	status = MLX5_GET(query_flow_counter_out, out, status);
-	syndrome = MLX5_GET(query_flow_counter_out, out, syndrome);
-	if (status) {
-		DRV_LOG(DEBUG, "Failed to create devx counters, "
-			"status %x, syndrome %x", status, syndrome);
-		return -1;
+	if (!dcs->obj) {
+		DRV_LOG(ERR, "Can't allocate counters - error %d\n", errno);
+		rte_errno = errno;
+		rte_free(dcs);
+		return NULL;
 	}
-	dcs->id = MLX5_GET(alloc_flow_counter_out,
-			   out, flow_counter_id);
-	return 0;
-}
-
-/**
- * Free flow counters obtained via devx interface.
- *
- * @param[in] obj
- *   devx object that was obtained from mlx5_devx_cmd_fc_alloc.
- *
- * @return
- *   0 on success, a negative value otherwise.
- */
-int mlx5_devx_cmd_flow_counter_free(struct mlx5dv_devx_obj *obj)
-{
-	return mlx5_glue->devx_obj_destroy(obj);
+	dcs->id = MLX5_GET(alloc_flow_counter_out, out, flow_counter_id);
+	return dcs;
 }
 
 /**
@@ -64,49 +56,140 @@  int mlx5_devx_cmd_flow_counter_free(struct mlx5dv_devx_obj *obj)
  *   devx object that was obtained from mlx5_devx_cmd_fc_alloc.
  * @param[in] clear
  *   Whether hardware should clear the counters after the query or not.
+ * @param[in] n_counters
+ *   The counter number to read.
  *  @param pkts
  *   The number of packets that matched the flow.
  *  @param bytes
  *    The number of bytes that matched the flow.
+ *  @param mkey
+ *   The mkey key for batch query.
+ *  @param addr
+ *    The address in the mkey range for batch query.
  *
  * @return
  *   0 on success, a negative value otherwise.
  */
 int
-mlx5_devx_cmd_flow_counter_query(struct mlx5_devx_counter_set *dcs,
-				 int clear __rte_unused,
-				 uint64_t *pkts, uint64_t *bytes)
+mlx5_devx_cmd_flow_counter_query(struct mlx5_devx_obj *dcs, int clear,
+				 uint32_t n_counters, uint64_t *pkts,
+				 uint64_t *bytes, uint32_t mkey, void *addr)
 {
-	uint32_t out[MLX5_ST_SZ_BYTES(query_flow_counter_out) +
-		MLX5_ST_SZ_BYTES(traffic_counter)]   = {0};
+	int out_len = MLX5_ST_SZ_BYTES(query_flow_counter_out) +
+			MLX5_ST_SZ_BYTES(traffic_counter);
+	uint32_t out[out_len];
 	uint32_t in[MLX5_ST_SZ_DW(query_flow_counter_in)] = {0};
 	void *stats;
-	int status, syndrome, rc;
+	int rc;
 
 	MLX5_SET(query_flow_counter_in, in, opcode,
 		 MLX5_CMD_OP_QUERY_FLOW_COUNTER);
 	MLX5_SET(query_flow_counter_in, in, op_mod, 0);
 	MLX5_SET(query_flow_counter_in, in, flow_counter_id, dcs->id);
-	rc = mlx5_glue->devx_obj_query(dcs->obj,
-				       in, sizeof(in), out, sizeof(out));
-	if (rc)
-		return rc;
-	status = MLX5_GET(query_flow_counter_out, out, status);
-	syndrome = MLX5_GET(query_flow_counter_out, out, syndrome);
-	if (status) {
-		DRV_LOG(DEBUG, "Failed to query devx counters, "
-			"id %d, status %x, syndrome = %x",
-			status, syndrome, dcs->id);
-		return -1;
+	MLX5_SET(query_flow_counter_in, in, clear, !!clear);
+
+	if (n_counters) {
+		MLX5_SET(query_flow_counter_in, in, num_of_counters,
+			 n_counters);
+		MLX5_SET(query_flow_counter_in, in, dump_to_memory, 1);
+		MLX5_SET(query_flow_counter_in, in, mkey, mkey);
+		MLX5_SET64(query_flow_counter_in, in, address,
+			   (uint64_t)(uintptr_t)addr);
+	}
+	rc = mlx5_glue->devx_obj_query(dcs->obj, in, sizeof(in), out, out_len);
+	if (rc) {
+		DRV_LOG(ERR, "Failed to query devx counters with rc %d\n ", rc);
+		rte_errno = rc;
+		return -rc;
+	}
+	if (!n_counters) {
+		stats = MLX5_ADDR_OF(query_flow_counter_out,
+				     out, flow_statistics);
+		*pkts = MLX5_GET64(traffic_counter, stats, packets);
+		*bytes = MLX5_GET64(traffic_counter, stats, octets);
 	}
-	stats = MLX5_ADDR_OF(query_flow_counter_out,
-			     out, flow_statistics);
-	*pkts = MLX5_GET64(traffic_counter, stats, packets);
-	*bytes = MLX5_GET64(traffic_counter, stats, octets);
 	return 0;
 }
 
 /**
+ * Create a new mkey.
+ *
+ * @param[in] ctx
+ *   ibv contexts returned from mlx5dv_open_device.
+ * @param[in] attr
+ *   Attributes of the requested mkey.
+ *
+ * @return
+ *   Pointer to Devx mkey on success, a negative value otherwise and rte_errno
+ *   is set.
+ */
+struct mlx5_devx_obj *
+mlx5_devx_cmd_mkey_create(struct ibv_context *ctx,
+			  struct mlx5_devx_mkey_attr *attr)
+{
+	uint32_t in[MLX5_ST_SZ_DW(create_mkey_in)] = {0};
+	uint32_t out[MLX5_ST_SZ_DW(create_mkey_out)] = {0};
+	void *mkc;
+	struct mlx5_devx_obj *mkey = rte_zmalloc("mkey", sizeof(*mkey), 0);
+	size_t pgsize;
+	uint32_t translation_size;
+
+	if (!mkey) {
+		rte_errno = ENOMEM;
+		return NULL;
+	}
+	pgsize = sysconf(_SC_PAGESIZE);
+	translation_size = (RTE_ALIGN(attr->size, pgsize) * 8) / 16;
+	MLX5_SET(create_mkey_in, in, opcode, MLX5_CMD_OP_CREATE_MKEY);
+	MLX5_SET(create_mkey_in, in, translations_octword_actual_size,
+		 translation_size);
+	MLX5_SET(create_mkey_in, in, mkey_umem_id, attr->umem_id);
+	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
+	MLX5_SET(mkc, mkc, lw, 0x1);
+	MLX5_SET(mkc, mkc, lr, 0x1);
+	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT);
+	MLX5_SET(mkc, mkc, qpn, 0xffffff);
+	MLX5_SET(mkc, mkc, pd, attr->pd);
+	MLX5_SET(mkc, mkc, mkey_7_0, attr->umem_id & 0xFF);
+	MLX5_SET(mkc, mkc, translations_octword_size, translation_size);
+	MLX5_SET64(mkc, mkc, start_addr, attr->addr);
+	MLX5_SET64(mkc, mkc, len, attr->size);
+	MLX5_SET(mkc, mkc, log_page_size, rte_log2_u32(pgsize));
+	mkey->obj = mlx5_glue->devx_obj_create(ctx, in, sizeof(in), out,
+					       sizeof(out));
+	if (!mkey->obj) {
+		DRV_LOG(ERR, "Can't create mkey - error %d\n", errno);
+		rte_errno = errno;
+		rte_free(mkey);
+		return NULL;
+	}
+	mkey->id = MLX5_GET(create_mkey_out, out, mkey_index);
+	mkey->id = (mkey->id << 8) | (attr->umem_id & 0xFF);
+	return mkey;
+}
+
+/**
+ * Destroy any object allocated by a Devx API.
+ *
+ * @param[in] obj
+ *   Pointer to a general object.
+ *
+ * @return
+ *   0 on success, a negative value otherwise.
+ */
+int
+mlx5_devx_cmd_destroy(struct mlx5_devx_obj *obj)
+{
+	int ret;
+
+	if (!obj)
+		return 0;
+	ret =  mlx5_glue->devx_obj_destroy(obj->obj);
+	rte_free(obj);
+	return ret;
+}
+
+/**
  * Query HCA attributes.
  * Using those attributes we can check on run time if the device
  * is having the required capabilities.
@@ -146,6 +229,8 @@  int mlx5_devx_cmd_flow_counter_free(struct mlx5dv_devx_obj *obj)
 		return -1;
 	}
 	hcattr = MLX5_ADDR_OF(query_hca_cap_out, out, capability);
+	attr->flow_counter_bulk_alloc_bitmap =
+			MLX5_GET(cmd_hca_cap, hcattr, flow_counter_bulk_alloc);
 	attr->eswitch_manager = MLX5_GET(cmd_hca_cap, hcattr, eswitch_manager);
 	return 0;
 }
diff --git a/drivers/net/mlx5/mlx5_flow.h b/drivers/net/mlx5/mlx5_flow.h
index 3d7fcf7..fbd09d0 100644
--- a/drivers/net/mlx5/mlx5_flow.h
+++ b/drivers/net/mlx5/mlx5_flow.h
@@ -354,25 +354,6 @@  struct mlx5_flow {
 	};
 };
 
-/* Counters information. */
-struct mlx5_flow_counter {
-	LIST_ENTRY(mlx5_flow_counter) next; /**< Pointer to the next counter. */
-	uint32_t shared:1; /**< Share counter ID with other flow rules. */
-	uint32_t ref_cnt:31; /**< Reference counter. */
-	uint32_t id; /**< Counter ID. */
-	union {  /**< Holds the counters for the rule. */
-#if defined(HAVE_IBV_DEVICE_COUNTERS_SET_V42)
-		struct ibv_counter_set *cs;
-#elif defined(HAVE_IBV_DEVICE_COUNTERS_SET_V45)
-		struct ibv_counters *cs;
-#endif
-		struct mlx5_devx_counter_set *dcs;
-	};
-	uint64_t hits; /**< Number of packets matched by the rule. */
-	uint64_t bytes; /**< Number of bytes matched by the rule. */
-	void *action; /**< Pointer to the dv action. */
-};
-
 /* Flow structure. */
 struct rte_flow {
 	TAILQ_ENTRY(rte_flow) next; /**< Pointer to the next flow structure. */
diff --git a/drivers/net/mlx5/mlx5_flow_dv.c b/drivers/net/mlx5/mlx5_flow_dv.c
index 9cc09e7..9654c3b 100644
--- a/drivers/net/mlx5/mlx5_flow_dv.c
+++ b/drivers/net/mlx5/mlx5_flow_dv.c
@@ -6,6 +6,7 @@ 
 #include <stdalign.h>
 #include <stdint.h>
 #include <string.h>
+#include <unistd.h>
 
 /* Verbs header. */
 /* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
@@ -2113,8 +2114,344 @@  struct field_modify_info modify_tcp[] = {
 	return 0;
 }
 
+#define MLX5_CNT_CONTAINER_SIZE 64
+#define MLX5_CNT_CONTAINER(priv, batch) (&(priv)->sh->cmng.ccont[batch])
+
+/**
+ * Get a pool by a counter.
+ *
+ * @param[in] cnt
+ *   Pointer to the counter.
+ *
+ * @return
+ *   The counter pool.
+ */
+static struct mlx5_flow_counter_pool *
+flow_dv_counter_pool_get(struct mlx5_flow_counter *cnt)
+{
+	if (!cnt->batch) {
+		cnt -= cnt->dcs->id % MLX5_COUNTERS_PER_POOL;
+		return (struct mlx5_flow_counter_pool *)cnt - 1;
+	}
+	return cnt->pool;
+}
+
+/**
+ * Get a pool by devx counter ID.
+ *
+ * @param[in] cont
+ *   Pointer to the counter container.
+ * @param[in] id
+ *   The counter devx ID.
+ *
+ * @return
+ *   The counter pool pointer if exists, NULL otherwise,
+ */
+static struct mlx5_flow_counter_pool *
+flow_dv_find_pool_by_id(struct mlx5_pools_container *cont, int id)
+{
+	struct mlx5_flow_counter_pool *pool;
+
+	TAILQ_FOREACH(pool, &cont->pool_list, next) {
+		int base = (pool->min_dcs->id / MLX5_COUNTERS_PER_POOL) *
+				MLX5_COUNTERS_PER_POOL;
+
+		if (id >= base && id < base + MLX5_COUNTERS_PER_POOL)
+			return pool;
+	};
+	return NULL;
+}
+
+/**
+ * Allocate a new memory for the counter values wrapped by all the needed
+ * management.
+ *
+ * @param[in] dev
+ *   Pointer to the Ethernet device structure.
+ * @param[in] raws_n
+ *   The raw memory areas - each one for MLX5_COUNTERS_PER_POOL counters.
+ *
+ * @return
+ *   The new memory management pointer on success, otherwise NULL and rte_errno
+ *   is set.
+ */
+static struct mlx5_counter_stats_mem_mng *
+flow_dv_create_counter_stat_mem_mng(struct rte_eth_dev *dev, int raws_n)
+{
+	struct mlx5_ibv_shared *sh = ((struct mlx5_priv *)
+					(dev->data->dev_private))->sh;
+	struct mlx5dv_pd dv_pd;
+	struct mlx5dv_obj dv_obj;
+	struct mlx5_devx_mkey_attr mkey_attr;
+	struct mlx5_counter_stats_mem_mng *mem_mng;
+	volatile struct flow_counter_stats *raw_data;
+	int size = (sizeof(struct flow_counter_stats) *
+			MLX5_COUNTERS_PER_POOL +
+			sizeof(struct mlx5_counter_stats_raw)) * raws_n +
+			sizeof(struct mlx5_counter_stats_mem_mng);
+	uint8_t *mem = rte_calloc(__func__, 1, size, sysconf(_SC_PAGESIZE));
+	int i;
+
+	if (!mem) {
+		rte_errno = ENOMEM;
+		return NULL;
+	}
+	mem_mng = (struct mlx5_counter_stats_mem_mng *)(mem + size) - 1;
+	size = sizeof(*raw_data) * MLX5_COUNTERS_PER_POOL * raws_n;
+	mem_mng->umem = mlx5_glue->devx_umem_reg(sh->ctx, mem, size,
+						 IBV_ACCESS_LOCAL_WRITE);
+	if (!mem_mng->umem) {
+		rte_errno = errno;
+		rte_free(mem);
+		return NULL;
+	}
+	dv_obj.pd.in = sh->pd;
+	dv_obj.pd.out = &dv_pd;
+	mlx5_glue->dv_init_obj(&dv_obj, MLX5DV_OBJ_PD);
+	mkey_attr.addr = (uintptr_t)mem;
+	mkey_attr.size = size;
+	mkey_attr.umem_id = mem_mng->umem->umem_id;
+	mkey_attr.pd = dv_pd.pdn;
+	mem_mng->dm = mlx5_devx_cmd_mkey_create(sh->ctx, &mkey_attr);
+	if (!mem_mng->dm) {
+		mlx5_glue->devx_umem_dereg(mem_mng->umem);
+		rte_errno = errno;
+		rte_free(mem);
+		return NULL;
+	}
+	mem_mng->raws = (struct mlx5_counter_stats_raw *)(mem + size);
+	raw_data = (volatile struct flow_counter_stats *)mem;
+	for (i = 0; i < raws_n; ++i) {
+		mem_mng->raws[i].mem_mng = mem_mng;
+		mem_mng->raws[i].data = raw_data + i * MLX5_COUNTERS_PER_POOL;
+	}
+	LIST_INSERT_HEAD(&sh->cmng.mem_mngs, mem_mng, next);
+	return mem_mng;
+}
+
+/**
+ * Prepare a counter container.
+ *
+ * @param[in] dev
+ *   Pointer to the Ethernet device structure.
+ * @param[in] batch
+ *   Whether the pool is for counter that was allocated by batch command.
+ *
+ * @return
+ *   The container pointer on success, otherwise NULL and rte_errno is set.
+ */
+static struct mlx5_pools_container *
+flow_dv_container_prepare(struct rte_eth_dev *dev, uint32_t batch)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_pools_container *cont = MLX5_CNT_CONTAINER(priv, batch);
+	struct mlx5_counter_stats_mem_mng *mem_mng;
+	uint32_t size = MLX5_CNT_CONTAINER_SIZE;
+	uint32_t mem_size = sizeof(struct mlx5_flow_counter_pool *) * size;
+
+	cont->pools = rte_calloc(__func__, 1, mem_size, 0);
+	if (!cont->pools) {
+		rte_errno = ENOMEM;
+		return NULL;
+	}
+	mem_mng = flow_dv_create_counter_stat_mem_mng(dev, size);
+	if (!mem_mng) {
+		rte_free(cont->pools);
+		return NULL;
+	}
+	cont->n = size;
+	TAILQ_INIT(&cont->pool_list);
+	cont->init_mem_mng = mem_mng;
+	return cont;
+}
+
+/**
+ * Query a devx flow counter.
+ *
+ * @param[in] dev
+ *   Pointer to the Ethernet device structure.
+ * @param[in] cnt
+ *   Pointer to the flow counter.
+ * @param[out] pkts
+ *   The statistics value of packets.
+ * @param[out] bytes
+ *   The statistics value of bytes.
+ *
+ * @return
+ *   0 on success, otherwise a negative errno value and rte_errno is set.
+ */
+static inline int
+_flow_dv_query_count(struct rte_eth_dev *dev __rte_unused,
+		     struct mlx5_flow_counter *cnt, uint64_t *pkts,
+		     uint64_t *bytes)
+{
+	struct mlx5_flow_counter_pool *pool =
+			flow_dv_counter_pool_get(cnt);
+	uint16_t offset = pool->min_dcs->id % MLX5_COUNTERS_PER_POOL;
+	int ret = mlx5_devx_cmd_flow_counter_query
+		(pool->min_dcs, 0, MLX5_COUNTERS_PER_POOL - offset, NULL,
+		 NULL, pool->raw->mem_mng->dm->id,
+		 (void *)(uintptr_t)(pool->raw->data +
+		 offset));
+
+	if (ret) {
+		DRV_LOG(ERR, "Failed to trigger synchronous"
+			" query for dcs ID %d\n",
+			pool->min_dcs->id);
+		return ret;
+	}
+	offset = cnt - &pool->counters_raw[0];
+	*pkts = rte_be_to_cpu_64(pool->raw->data[offset].hits);
+	*bytes = rte_be_to_cpu_64(pool->raw->data[offset].bytes);
+	return 0;
+}
+
+/**
+ * Create and initialize a new counter pool.
+ *
+ * @param[in] dev
+ *   Pointer to the Ethernet device structure.
+ * @param[out] dcs
+ *   The devX counter handle.
+ * @param[in] batch
+ *   Whether the pool is for counter that was allocated by batch command.
+ *
+ * @return
+ *   A new pool pointer on success, NULL otherwise and rte_errno is set.
+ */
+static struct mlx5_flow_counter_pool *
+flow_dv_pool_create(struct rte_eth_dev *dev, struct mlx5_devx_obj *dcs,
+		    uint32_t batch)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_flow_counter_pool *pool;
+	struct mlx5_pools_container *cont = MLX5_CNT_CONTAINER(priv, batch);
+	uint32_t size;
+
+	if (!cont->n) {
+		cont = flow_dv_container_prepare(dev, batch);
+		if (!cont)
+			return NULL;
+	} else if (cont->n == cont->n_valid) {
+		DRV_LOG(ERR, "No space in container to allocate a new pool\n");
+		rte_errno = ENOSPC;
+		return NULL;
+	}
+	size = sizeof(*pool) + MLX5_COUNTERS_PER_POOL *
+			sizeof(struct mlx5_flow_counter);
+	pool = rte_calloc(__func__, 1, size, 0);
+	if (!pool) {
+		rte_errno = ENOMEM;
+		return NULL;
+	}
+	pool->min_dcs = dcs;
+	pool->raw = cont->init_mem_mng->raws + cont->n_valid;
+	TAILQ_INIT(&pool->counters);
+	TAILQ_INSERT_TAIL(&cont->pool_list, pool, next);
+	cont->pools[cont->n_valid] = pool;
+	cont->n_valid++;
+	return pool;
+}
+
 /**
- * Get or create a flow counter.
+ * Prepare a new counter and/or a new counter pool.
+ *
+ * @param[in] dev
+ *   Pointer to the Ethernet device structure.
+ * @param[out] cnt_free
+ *   Where to put the pointer of a new counter.
+ * @param[in] batch
+ *   Whether the pool is for counter that was allocated by batch command.
+ *
+ * @return
+ *   The free counter pool pointer and @p cnt_free is set on success,
+ *   NULL otherwise and rte_errno is set.
+ */
+static struct mlx5_flow_counter_pool *
+flow_dv_counter_pool_prepare(struct rte_eth_dev *dev,
+			     struct mlx5_flow_counter **cnt_free,
+			     uint32_t batch)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_flow_counter_pool *pool;
+	struct mlx5_devx_obj *dcs = NULL;
+	struct mlx5_flow_counter *cnt;
+	uint32_t i;
+
+	if (!batch) {
+		/* bulk_bitmap must be 0 for single counter allocation. */
+		dcs = mlx5_devx_cmd_flow_counter_alloc(priv->sh->ctx, 0);
+		if (!dcs)
+			return NULL;
+		pool = flow_dv_find_pool_by_id(MLX5_CNT_CONTAINER(priv, batch),
+					       dcs->id);
+		if (!pool) {
+			pool = flow_dv_pool_create(dev, dcs, batch);
+			if (!pool) {
+				mlx5_devx_cmd_destroy(dcs);
+				return NULL;
+			}
+		} else if (dcs->id < pool->min_dcs->id) {
+			pool->min_dcs->id = dcs->id;
+		}
+		cnt = &pool->counters_raw[dcs->id % MLX5_COUNTERS_PER_POOL];
+		TAILQ_INSERT_HEAD(&pool->counters, cnt, next);
+		cnt->dcs = dcs;
+		*cnt_free = cnt;
+		return pool;
+	}
+	/* bulk_bitmap is in 128 counters units. */
+	if (priv->config.hca_attr.flow_counter_bulk_alloc_bitmap & 0x4)
+		dcs = mlx5_devx_cmd_flow_counter_alloc(priv->sh->ctx, 0x4);
+	if (!dcs) {
+		rte_errno = ENODATA;
+		return NULL;
+	}
+	pool = flow_dv_pool_create(dev, dcs, batch);
+	if (!pool) {
+		mlx5_devx_cmd_destroy(dcs);
+		return NULL;
+	}
+	for (i = 0; i < MLX5_COUNTERS_PER_POOL; ++i) {
+		cnt = &pool->counters_raw[i];
+		cnt->pool = pool;
+		TAILQ_INSERT_HEAD(&pool->counters, cnt, next);
+	}
+	*cnt_free = &pool->counters_raw[0];
+	return pool;
+}
+
+/**
+ * Search for existed shared counter.
+ *
+ * @param[in] cont
+ *   Pointer to the relevant counter pool container.
+ * @param[in] id
+ *   The shared counter ID to search.
+ *
+ * @return
+ *   NULL if not existed, otherwise pointer to the shared counter.
+ */
+static struct mlx5_flow_counter *
+flow_dv_counter_shared_search(struct mlx5_pools_container *cont,
+			      uint32_t id)
+{
+	static struct mlx5_flow_counter *cnt;
+	struct mlx5_flow_counter_pool *pool;
+	int i;
+
+	TAILQ_FOREACH(pool, &cont->pool_list, next) {
+		for (i = 0; i < MLX5_COUNTERS_PER_POOL; ++i) {
+			cnt = &pool->counters_raw[i];
+			if (cnt->ref_cnt && cnt->shared && cnt->id == id)
+				return cnt;
+		}
+	}
+	return NULL;
+}
+
+/**
+ * Allocate a flow counter.
  *
  * @param[in] dev
  *   Pointer to the Ethernet device structure.
@@ -2122,80 +2459,110 @@  struct field_modify_info modify_tcp[] = {
  *   Indicate if this counter is shared with other flows.
  * @param[in] id
  *   Counter identifier.
+ * @param[in] group
+ *   Counter flow group.
  *
  * @return
  *   pointer to flow counter on success, NULL otherwise and rte_errno is set.
  */
 static struct mlx5_flow_counter *
-flow_dv_counter_new(struct rte_eth_dev *dev, uint32_t shared, uint32_t id)
+flow_dv_counter_alloc(struct rte_eth_dev *dev, uint32_t shared, uint32_t id,
+		      uint16_t group)
 {
 	struct mlx5_priv *priv = dev->data->dev_private;
-	struct mlx5_flow_counter *cnt = NULL;
-	struct mlx5_devx_counter_set *dcs = NULL;
-	int ret;
+	struct mlx5_flow_counter_pool *pool = NULL;
+	struct mlx5_flow_counter *cnt_free = NULL;
+	/*
+	 * Currently group 0 flow counter cannot be assigned to a flow if it is
+	 * not the first one in the batch counter allocation, so it is better
+	 * to allocate counters one by one for these flows in a separate
+	 * container.
+	 * A counter can be shared between different groups so need to take
+	 * shared counters from the single container.
+	 */
+	uint32_t batch = (group && !shared) ? 1 : 0;
+	struct mlx5_pools_container *cont = MLX5_CNT_CONTAINER(priv, batch);
 
 	if (!priv->config.devx) {
-		ret = -ENOTSUP;
-		goto error_exit;
+		rte_errno = ENOTSUP;
+		return NULL;
 	}
 	if (shared) {
-		LIST_FOREACH(cnt, &priv->flow_counters, next) {
-			if (cnt->shared && cnt->id == id) {
-				cnt->ref_cnt++;
-				return cnt;
+		cnt_free = flow_dv_counter_shared_search(cont, id);
+		if (cnt_free) {
+			if (cnt_free->ref_cnt + 1 == 0) {
+				rte_errno = E2BIG;
+				return NULL;
 			}
+			cnt_free->ref_cnt++;
+			return cnt_free;
 		}
 	}
-	cnt = rte_calloc(__func__, 1, sizeof(*cnt), 0);
-	dcs = rte_calloc(__func__, 1, sizeof(*dcs), 0);
-	if (!dcs || !cnt) {
-		ret = -ENOMEM;
-		goto error_exit;
+	/* Pools which has a free counters are in the start. */
+	pool = TAILQ_FIRST(&cont->pool_list);
+	if (pool)
+		cnt_free = TAILQ_FIRST(&pool->counters);
+	if (!cnt_free) {
+		pool = flow_dv_counter_pool_prepare(dev, &cnt_free, batch);
+		if (!pool)
+			return NULL;
 	}
-	ret = mlx5_devx_cmd_flow_counter_alloc(priv->sh->ctx, dcs);
-	if (ret)
-		goto error_exit;
-	struct mlx5_flow_counter tmpl = {
-		.shared = shared,
-		.ref_cnt = 1,
-		.id = id,
-		.dcs = dcs,
-	};
-	tmpl.action = mlx5_glue->dv_create_flow_action_counter(dcs->obj, 0);
-	if (!tmpl.action) {
-		ret = errno;
-		goto error_exit;
+	cnt_free->batch = batch;
+	/* Create a DV counter action only in the first time usage. */
+	if (!cnt_free->action) {
+		uint16_t offset;
+		struct mlx5_devx_obj *dcs;
+
+		if (batch) {
+			offset = cnt_free - &pool->counters_raw[0];
+			dcs = pool->min_dcs;
+		} else {
+			offset = 0;
+			dcs = cnt_free->dcs;
+		}
+		cnt_free->action = mlx5_glue->dv_create_flow_action_counter
+					(dcs->obj, offset);
+		if (!cnt_free->action) {
+			rte_errno = errno;
+			return NULL;
+		}
 	}
-	*cnt = tmpl;
-	LIST_INSERT_HEAD(&priv->flow_counters, cnt, next);
-	return cnt;
-error_exit:
-	rte_free(cnt);
-	rte_free(dcs);
-	rte_errno = -ret;
-	return NULL;
+	/* Update the counter reset values. */
+	if (_flow_dv_query_count(dev, cnt_free, &cnt_free->hits,
+				 &cnt_free->bytes))
+		return NULL;
+	cnt_free->shared = shared;
+	cnt_free->ref_cnt = 1;
+	cnt_free->id = id;
+	TAILQ_REMOVE(&pool->counters, cnt_free, next);
+	if (TAILQ_EMPTY(&pool->counters)) {
+		/* Move the pool to the end of the container pool list. */
+		TAILQ_REMOVE(&cont->pool_list, pool, next);
+		TAILQ_INSERT_TAIL(&cont->pool_list, pool, next);
+	}
+	return cnt_free;
 }
 
 /**
  * Release a flow counter.
  *
+ * @param[in] dev
+ *   Pointer to the Ethernet device structure.
  * @param[in] counter
  *   Pointer to the counter handler.
  */
 static void
-flow_dv_counter_release(struct mlx5_flow_counter *counter)
+flow_dv_counter_release(struct rte_eth_dev *dev __rte_unused,
+			struct mlx5_flow_counter *counter)
 {
-	int ret;
-
 	if (!counter)
 		return;
 	if (--counter->ref_cnt == 0) {
-		ret = mlx5_devx_cmd_flow_counter_free(counter->dcs->obj);
-		if (ret)
-			DRV_LOG(ERR, "Failed to free devx counters, %d", ret);
-		LIST_REMOVE(counter, next);
-		rte_free(counter->dcs);
-		rte_free(counter);
+		struct mlx5_flow_counter_pool *pool =
+				flow_dv_counter_pool_get(counter);
+
+		/* Put the counter in the end - the earliest one. */
+		TAILQ_INSERT_TAIL(&pool->counters, counter, next);
 	}
 }
 
@@ -4103,8 +4470,10 @@  struct field_modify_info modify_tcp[] = {
 				rte_errno = ENOTSUP;
 				goto cnt_err;
 			}
-			flow->counter = flow_dv_counter_new(dev, count->shared,
-							    count->id);
+			flow->counter = flow_dv_counter_alloc(dev,
+							      count->shared,
+							      count->id,
+							      attr->group);
 			if (flow->counter == NULL)
 				goto cnt_err;
 			dev_flow->dv.actions[actions_n++] =
@@ -4770,7 +5139,7 @@  struct field_modify_info modify_tcp[] = {
 		return;
 	flow_dv_remove(dev, flow);
 	if (flow->counter) {
-		flow_dv_counter_release(flow->counter);
+		flow_dv_counter_release(dev, flow->counter);
 		flow->counter = NULL;
 	}
 	if (flow->tag_resource) {
@@ -4815,9 +5184,6 @@  struct field_modify_info modify_tcp[] = {
 {
 	struct mlx5_priv *priv = dev->data->dev_private;
 	struct rte_flow_query_count *qc = data;
-	uint64_t pkts = 0;
-	uint64_t bytes = 0;
-	int err;
 
 	if (!priv->config.devx)
 		return rte_flow_error_set(error, ENOTSUP,
@@ -4825,15 +5191,14 @@  struct field_modify_info modify_tcp[] = {
 					  NULL,
 					  "counters are not supported");
 	if (flow->counter) {
-		err = mlx5_devx_cmd_flow_counter_query
-						(flow->counter->dcs,
-						 qc->reset, &pkts, &bytes);
+		uint64_t pkts, bytes;
+		int err = _flow_dv_query_count(dev, flow->counter, &pkts,
+					       &bytes);
+
 		if (err)
-			return rte_flow_error_set
-				(error, err,
-				 RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
-				 NULL,
-				 "cannot read counters");
+			return rte_flow_error_set(error, -err,
+					RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+					NULL, "cannot read counters");
 		qc->hits_set = 1;
 		qc->bytes_set = 1;
 		qc->hits = pkts - flow->counter->hits;
diff --git a/drivers/net/mlx5/mlx5_flow_verbs.c b/drivers/net/mlx5/mlx5_flow_verbs.c
index 2f4c80c..b3395b8 100644
--- a/drivers/net/mlx5/mlx5_flow_verbs.c
+++ b/drivers/net/mlx5/mlx5_flow_verbs.c
@@ -124,7 +124,7 @@ 
 	int ret;
 
 	if (shared) {
-		LIST_FOREACH(cnt, &priv->flow_counters, next) {
+		TAILQ_FOREACH(cnt, &priv->sh->cmng.flow_counters, next) {
 			if (cnt->shared && cnt->id == id) {
 				cnt->ref_cnt++;
 				return cnt;
@@ -144,7 +144,7 @@ 
 	/* Create counter with Verbs. */
 	ret = flow_verbs_counter_create(dev, cnt);
 	if (!ret) {
-		LIST_INSERT_HEAD(&priv->flow_counters, cnt, next);
+		TAILQ_INSERT_HEAD(&priv->sh->cmng.flow_counters, cnt, next);
 		return cnt;
 	}
 	/* Some error occurred in Verbs library. */
@@ -156,19 +156,24 @@ 
 /**
  * Release a flow counter.
  *
+ * @param[in] dev
+ *   Pointer to the Ethernet device structure.
  * @param[in] counter
  *   Pointer to the counter handler.
  */
 static void
-flow_verbs_counter_release(struct mlx5_flow_counter *counter)
+flow_verbs_counter_release(struct rte_eth_dev *dev,
+			   struct mlx5_flow_counter *counter)
 {
+	struct mlx5_priv *priv = dev->data->dev_private;
+
 	if (--counter->ref_cnt == 0) {
 #if defined(HAVE_IBV_DEVICE_COUNTERS_SET_V42)
 		claim_zero(mlx5_glue->destroy_counter_set(counter->cs));
 #elif defined(HAVE_IBV_DEVICE_COUNTERS_SET_V45)
 		claim_zero(mlx5_glue->destroy_counters(counter->cs));
 #endif
-		LIST_REMOVE(counter, next);
+		TAILQ_REMOVE(&priv->sh->cmng.flow_counters, counter, next);
 		rte_free(counter);
 	}
 }
@@ -1612,7 +1617,7 @@ 
 		rte_free(dev_flow);
 	}
 	if (flow->counter) {
-		flow_verbs_counter_release(flow->counter);
+		flow_verbs_counter_release(dev, flow->counter);
 		flow->counter = NULL;
 	}
 }
diff --git a/drivers/net/mlx5/mlx5_glue.c b/drivers/net/mlx5/mlx5_glue.c
index d038373..ba5fd06 100644
--- a/drivers/net/mlx5/mlx5_glue.c
+++ b/drivers/net/mlx5/mlx5_glue.c
@@ -849,6 +849,33 @@ 
 #endif
 }
 
+static struct mlx5dv_devx_umem *
+mlx5_glue_devx_umem_reg(struct ibv_context *context, void *addr, size_t size,
+			uint32_t access)
+{
+#ifdef HAVE_IBV_DEVX_OBJ
+	return mlx5dv_devx_umem_reg(context, addr, size, access);
+#else
+	(void)context;
+	(void)addr;
+	(void)size;
+	(void)access;
+	errno = -ENOTSUP;
+	return NULL;
+#endif
+}
+
+static int
+mlx5_glue_devx_umem_dereg(struct mlx5dv_devx_umem *dv_devx_umem)
+{
+#ifdef HAVE_IBV_DEVX_OBJ
+	return mlx5dv_devx_umem_dereg(dv_devx_umem);
+#else
+	(void)dv_devx_umem;
+	return -ENOTSUP;
+#endif
+}
+
 alignas(RTE_CACHE_LINE_SIZE)
 const struct mlx5_glue *mlx5_glue = &(const struct mlx5_glue){
 	.version = MLX5_GLUE_VERSION,
@@ -930,4 +957,6 @@ 
 	.devx_obj_query = mlx5_glue_devx_obj_query,
 	.devx_obj_modify = mlx5_glue_devx_obj_modify,
 	.devx_general_cmd = mlx5_glue_devx_general_cmd,
+	.devx_umem_reg = mlx5_glue_devx_umem_reg,
+	.devx_umem_dereg = mlx5_glue_devx_umem_dereg,
 };
diff --git a/drivers/net/mlx5/mlx5_glue.h b/drivers/net/mlx5/mlx5_glue.h
index 433c9ed..18b1ce6 100644
--- a/drivers/net/mlx5/mlx5_glue.h
+++ b/drivers/net/mlx5/mlx5_glue.h
@@ -61,6 +61,7 @@ 
 
 #ifndef HAVE_IBV_DEVX_OBJ
 struct mlx5dv_devx_obj;
+struct mlx5dv_devx_umem;
 #endif
 
 #ifndef HAVE_MLX5DV_DR
@@ -209,6 +210,10 @@  struct mlx5_glue {
 	int (*devx_general_cmd)(struct ibv_context *context,
 				const void *in, size_t inlen,
 				void *out, size_t outlen);
+	struct mlx5dv_devx_umem *(*devx_umem_reg)(struct ibv_context *context,
+						  void *addr, size_t size,
+						  uint32_t access);
+	int (*devx_umem_dereg)(struct mlx5dv_devx_umem *dv_devx_umem);
 };
 
 const struct mlx5_glue *mlx5_glue;
diff --git a/drivers/net/mlx5/mlx5_prm.h b/drivers/net/mlx5/mlx5_prm.h
index 7482383..e2e538d 100644
--- a/drivers/net/mlx5/mlx5_prm.h
+++ b/drivers/net/mlx5/mlx5_prm.h
@@ -415,6 +415,14 @@  struct mlx5_modification_cmd {
 				 (((_v) & __mlx5_mask(typ, fld)) << \
 				   __mlx5_dw_bit_off(typ, fld))); \
 	} while (0)
+
+#define MLX5_SET64(typ, p, fld, v) \
+	do { \
+		assert(__mlx5_bit_sz(typ, fld) == 64); \
+		*((__be64 *)(p) + __mlx5_64_off(typ, fld)) = \
+			rte_cpu_to_be_64(v); \
+	} while (0)
+
 #define MLX5_GET(typ, p, fld) \
 	((rte_be_to_cpu_32(*((__be32 *)(p) +\
 	__mlx5_dw_off(typ, fld))) >> __mlx5_dw_bit_off(typ, fld)) & \
@@ -552,10 +560,15 @@  enum {
 
 enum {
 	MLX5_CMD_OP_QUERY_HCA_CAP = 0x100,
+	MLX5_CMD_OP_CREATE_MKEY = 0x200,
 	MLX5_CMD_OP_ALLOC_FLOW_COUNTER = 0x939,
 	MLX5_CMD_OP_QUERY_FLOW_COUNTER = 0x93b,
 };
 
+enum {
+	MLX5_MKC_ACCESS_MODE_MTT   = 0x1,
+};
+
 /* Flow counters. */
 struct mlx5_ifc_alloc_flow_counter_out_bits {
 	u8         status[0x8];
@@ -570,7 +583,9 @@  struct mlx5_ifc_alloc_flow_counter_in_bits {
 	u8         reserved_at_10[0x10];
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
-	u8         reserved_at_40[0x40];
+	u8         flow_counter_id[0x20];
+	u8         reserved_at_40[0x18];
+	u8         flow_counter_bulk[0x8];
 };
 
 struct mlx5_ifc_dealloc_flow_counter_out_bits {
@@ -607,13 +622,102 @@  struct mlx5_ifc_query_flow_counter_in_bits {
 	u8         reserved_at_10[0x10];
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
-	u8         reserved_at_40[0x80];
+	u8         reserved_at_40[0x20];
+	u8         mkey[0x20];
+	u8         address[0x40];
 	u8         clear[0x1];
-	u8         reserved_at_c1[0xf];
-	u8         num_of_counters[0x10];
+	u8         dump_to_memory[0x1];
+	u8         num_of_counters[0x1e];
 	u8         flow_counter_id[0x20];
 };
 
+struct mlx5_ifc_mkc_bits {
+	u8         reserved_at_0[0x1];
+	u8         free[0x1];
+	u8         reserved_at_2[0x1];
+	u8         access_mode_4_2[0x3];
+	u8         reserved_at_6[0x7];
+	u8         relaxed_ordering_write[0x1];
+	u8         reserved_at_e[0x1];
+	u8         small_fence_on_rdma_read_response[0x1];
+	u8         umr_en[0x1];
+	u8         a[0x1];
+	u8         rw[0x1];
+	u8         rr[0x1];
+	u8         lw[0x1];
+	u8         lr[0x1];
+	u8         access_mode_1_0[0x2];
+	u8         reserved_at_18[0x8];
+
+	u8         qpn[0x18];
+	u8         mkey_7_0[0x8];
+
+	u8         reserved_at_40[0x20];
+
+	u8         length64[0x1];
+	u8         bsf_en[0x1];
+	u8         sync_umr[0x1];
+	u8         reserved_at_63[0x2];
+	u8         expected_sigerr_count[0x1];
+	u8         reserved_at_66[0x1];
+	u8         en_rinval[0x1];
+	u8         pd[0x18];
+
+	u8         start_addr[0x40];
+
+	u8         len[0x40];
+
+	u8         bsf_octword_size[0x20];
+
+	u8         reserved_at_120[0x80];
+
+	u8         translations_octword_size[0x20];
+
+	u8         reserved_at_1c0[0x1b];
+	u8         log_page_size[0x5];
+
+	u8         reserved_at_1e0[0x20];
+};
+
+struct mlx5_ifc_create_mkey_out_bits {
+	u8         status[0x8];
+	u8         reserved_at_8[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_at_40[0x8];
+	u8         mkey_index[0x18];
+
+	u8         reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_create_mkey_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_at_10[0x10];
+
+	u8         reserved_at_20[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_at_40[0x20];
+
+	u8         pg_access[0x1];
+	u8         reserved_at_61[0x1f];
+
+	struct mlx5_ifc_mkc_bits memory_key_mkey_entry;
+
+	u8         reserved_at_280[0x80];
+
+	u8         translations_octword_actual_size[0x20];
+
+	u8         mkey_umem_id[0x20];
+
+	u8         mkey_umem_offset[0x40];
+
+	u8         reserved_at_380[0x500];
+
+	u8         klm_pas_mtt[][0x20];
+};
+
 enum {
 	MLX5_GET_HCA_CAP_OP_MOD_GENERAL_DEVICE = 0x0 << 1,
 	MLX5_GET_HCA_CAP_OP_MOD_QOS_CAP        = 0xc << 1,