@@ -2655,3 +2655,74 @@ Destroy GENEVE TLV parser for specific port::
This command doesn't destroy the global list;
to release options, the ``flush`` command should be used.
+
+
+Notes for flow counters
+-----------------------
+
+mlx5 PMD supports the COUNT flow action, which provides the ability
+to count packets (and bytes) matched against a given flow rule.
+This section gives a high-level overview of how this support
+is implemented and of its limitations.
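+
+As an illustration, an application can attach a counter to a flow rule
+and read it back with ``rte_flow_query()``. The following is a minimal
+sketch using the synchronous flow API (the port ID, pattern and DROP
+action are assumptions, and error handling is trimmed)::
+
+   #include <stdio.h>
+   #include <inttypes.h>
+   #include <rte_flow.h>
+
+   static void
+   count_example(uint16_t port_id)
+   {
+       struct rte_flow_error err;
+       struct rte_flow_attr attr = { .ingress = 1 };
+       struct rte_flow_item pattern[] = {
+           { .type = RTE_FLOW_ITEM_TYPE_ETH },
+           { .type = RTE_FLOW_ITEM_TYPE_END },
+       };
+       struct rte_flow_action_count count_conf = { .id = 0 };
+       struct rte_flow_action actions[] = {
+           { .type = RTE_FLOW_ACTION_TYPE_COUNT, .conf = &count_conf },
+           { .type = RTE_FLOW_ACTION_TYPE_DROP },
+           { .type = RTE_FLOW_ACTION_TYPE_END },
+       };
+       struct rte_flow_query_count query = { .reset = 0 };
+       struct rte_flow *flow;
+
+       /* Create the rule, then read the counter attached to it. */
+       flow = rte_flow_create(port_id, &attr, pattern, actions, &err);
+       if (flow != NULL &&
+           rte_flow_query(port_id, flow, &actions[0], &query, &err) == 0)
+           printf("hits=%" PRIu64 " bytes=%" PRIu64 "\n",
+                  query.hits, query.bytes);
+   }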
+
+HW Steering flow engine
+~~~~~~~~~~~~~~~~~~~~~~~
+
+Flow counters are allocated from HW in bulks.
+A set of bulks forms a flow counter pool managed by the PMD.
+When flow counters are queried from HW,
+each counter is identified by an offset in a given bulk.
+Querying a HW flow counter requires sending a request to HW,
+asking for a read of counter values at the given offsets.
+HW asynchronously provides these values through a DMA write.
+
+To optimize HW-to-SW communication,
+these requests are handled in a separate counter service thread
+spawned by the mlx5 PMD.
+This service thread refreshes the counter values stored in host memory,
+in cycles, each spanning ``svc_cycle_time`` milliseconds.
+By default, ``svc_cycle_time`` is set to 500 milliseconds.
+When an application queries the COUNT flow action,
+the PMD returns the values stored in host memory.
+
+The mlx5 PMD manages three global rings of allocated counter offsets:
+
+- ``free`` ring - Counters which have never been used.
+- ``wait_reset`` ring - Counters which were used in some flow rules
+  but were recently freed (the flow rule or the indirect action using
+  them was destroyed).
+  Since a counter's value might have changed between the last counter
+  service thread cycle and the moment it was freed, the value in host
+  memory might be stale.
+  During the next service thread cycle, such counters are moved
+  to the ``reuse`` ring.
+- ``reuse`` ring - Counters which were used at least once and
+ can be reused in new flow rules.
+
+When a counter is assigned to a flow rule (or allocated to an indirect
+action), the PMD first tries to fetch it from the ``reuse`` ring.
+If that ring is empty, the PMD fetches a counter from the ``free`` ring,
+as illustrated in the sketch below.
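+
+The allocation order can be modeled with two ring dequeues. This is a
+simplified sketch; the ring names mirror this documentation rather than
+actual PMD symbols, and the error codes preview the section below::
+
+   #include <errno.h>
+   #include <rte_ring.h>
+   #include <rte_ring_elem.h>
+
+   static int
+   counter_alloc(struct rte_ring *reuse, struct rte_ring *free_ring,
+                 struct rte_ring *wait_reset, uint32_t *cnt_id)
+   {
+       /* Prefer recently recycled counters, then never-used ones. */
+       if (rte_ring_dequeue_elem(reuse, cnt_id, sizeof(*cnt_id)) == 0)
+           return 0;
+       if (rte_ring_dequeue_elem(free_ring, cnt_id, sizeof(*cnt_id)) == 0)
+           return 0;
+       /* Nothing usable right now; distinguish the two failure modes. */
+       return rte_ring_count(wait_reset) > 0 ? -EAGAIN : -ENOENT;
+   }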
+
+The counter service thread works as follows
+(a timing sketch follows this list):
+
+#. Record counters stored in ``wait_reset`` ring.
+#. Read values of all counters which were used at least once
+ or are currently in use.
+#. Move recorded counters from ``wait_reset`` to ``reuse`` ring.
+#. Sleep for ``svc_cycle_time - (query time)`` milliseconds.
+#. Repeat.
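+
+In pseudo-C, one cycle's timing mirrors the actual service loop;
+``do_query_cycle()`` below is a hypothetical stand-in for steps 1-3::
+
+   #include <rte_cycles.h>
+
+   static void do_query_cycle(void); /* hypothetical: steps 1-3 */
+
+   static void
+   svc_one_cycle(uint64_t interval_us)
+   {
+       uint64_t start = rte_get_timer_cycles();
+       uint64_t query_us;
+
+       do_query_cycle(); /* record, read and recycle counters */
+       query_us = (rte_get_timer_cycles() - start) /
+                  (rte_get_timer_hz() / US_PER_S);
+       /* Sleep only for the remainder of the cycle, if any. */
+       if (interval_us > query_us)
+           rte_delay_us_sleep(interval_us - query_us);
+   }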
+
+Because freeing a counter (by destroying a flow rule or an indirect
+action) does not immediately make it available to the application again,
+the PMD might return:
+
+- ``ENOENT`` if no counter is available in the ``free``, ``reuse``,
+  or ``wait_reset`` rings.
+  No counter will be available until the application releases some of them.
+- ``EAGAIN`` if no counter is available in the ``free`` and ``reuse``
+  rings, but there are counters in the ``wait_reset`` ring.
+  This means that new counters will become available after the next
+  service thread cycle.
+
+Applications have to be aware that flow rule creation or indirect action
+creation might need to be retried, as in the sketch below.
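+
+A hedged sketch of such a retry with the asynchronous flow API follows;
+the template indices, user data and the omitted backoff and completion
+polling (``rte_flow_pull()``) are assumptions left to the application::
+
+   #include <errno.h>
+   #include <rte_errno.h>
+   #include <rte_flow.h>
+
+   static struct rte_flow *
+   create_with_retry(uint16_t port_id, uint32_t queue_id,
+                     const struct rte_flow_op_attr *op_attr,
+                     struct rte_flow_template_table *table,
+                     const struct rte_flow_item pattern[],
+                     const struct rte_flow_action actions[],
+                     void *user_data)
+   {
+       struct rte_flow_error err;
+       struct rte_flow *flow;
+
+       do {
+           flow = rte_flow_async_create(port_id, queue_id, op_attr,
+                                        table, pattern, 0,
+                                        actions, 0, user_data, &err);
+           /* EAGAIN: counters sit in ``wait_reset``; retrying (ideally
+            * after a short sleep) succeeds once the service thread has
+            * run. ENOENT or any other error is treated as fatal here. */
+       } while (flow == NULL && rte_errno == EAGAIN);
+       return flow;
+   }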
@@ -3734,8 +3734,11 @@ flow_hw_actions_construct(struct rte_eth_dev *dev,
case RTE_FLOW_ACTION_TYPE_COUNT:
cnt_queue = mlx5_hws_cnt_get_queue(priv, &queue);
ret = mlx5_hws_cnt_pool_get(priv->hws_cpool, cnt_queue, &cnt_id, age_idx);
- if (ret != 0)
+ if (ret != 0) {
+ rte_flow_error_set(error, -ret, RTE_FLOW_ERROR_TYPE_ACTION,
+ action, "Failed to allocate flow counter");
goto error;
+ }
ret = mlx5_hws_cnt_pool_get_action_offset
(priv->hws_cpool,
cnt_id,
@@ -3980,6 +3983,7 @@ flow_hw_async_flow_create_generic(struct rte_eth_dev *dev,
struct mlx5dr_rule_action *rule_acts;
struct rte_flow_hw *flow = NULL;
const struct rte_flow_item *rule_items;
+ struct rte_flow_error sub_error = { 0 };
uint32_t flow_idx = 0;
uint32_t res_idx = 0;
int ret;
@@ -4037,7 +4041,7 @@ flow_hw_async_flow_create_generic(struct rte_eth_dev *dev,
&table->ats[action_template_index],
table->its[pattern_template_index]->item_flags,
flow->table, actions,
- rule_acts, queue, error))
+ rule_acts, queue, &sub_error))
goto error;
rule_items = flow_hw_get_rule_items(dev, table, items,
pattern_template_index, &priv->hw_q[queue].pp);
@@ -4074,9 +4078,12 @@ flow_hw_async_flow_create_generic(struct rte_eth_dev *dev,
mlx5_ipool_free(table->resource, res_idx);
if (flow_idx)
mlx5_ipool_free(table->flow, flow_idx);
- rte_flow_error_set(error, rte_errno,
- RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
- "fail to create rte flow");
+	if (sub_error.type != RTE_FLOW_ERROR_TYPE_NONE && error != NULL)
+ *error = sub_error;
+ else
+ rte_flow_error_set(error, rte_errno,
+ RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+ "fail to create rte flow");
return NULL;
}
@@ -56,26 +56,29 @@ __mlx5_hws_cnt_svc(struct mlx5_dev_ctx_shared *sh,
uint32_t ret __rte_unused;
reset_cnt_num = rte_ring_count(reset_list);
- do {
- cpool->query_gen++;
- mlx5_aso_cnt_query(sh, cpool);
- zcdr.n1 = 0;
- zcdu.n1 = 0;
- ret = rte_ring_enqueue_zc_burst_elem_start(reuse_list,
- sizeof(cnt_id_t),
- reset_cnt_num, &zcdu,
- NULL);
- MLX5_ASSERT(ret == reset_cnt_num);
- ret = rte_ring_dequeue_zc_burst_elem_start(reset_list,
- sizeof(cnt_id_t),
- reset_cnt_num, &zcdr,
- NULL);
- MLX5_ASSERT(ret == reset_cnt_num);
- __hws_cnt_r2rcpy(&zcdu, &zcdr, reset_cnt_num);
- rte_ring_dequeue_zc_elem_finish(reset_list, reset_cnt_num);
- rte_ring_enqueue_zc_elem_finish(reuse_list, reset_cnt_num);
+ cpool->query_gen++;
+ mlx5_aso_cnt_query(sh, cpool);
+ zcdr.n1 = 0;
+ zcdu.n1 = 0;
+ ret = rte_ring_enqueue_zc_burst_elem_start(reuse_list,
+ sizeof(cnt_id_t),
+ reset_cnt_num, &zcdu,
+ NULL);
+ MLX5_ASSERT(ret == reset_cnt_num);
+ ret = rte_ring_dequeue_zc_burst_elem_start(reset_list,
+ sizeof(cnt_id_t),
+ reset_cnt_num, &zcdr,
+ NULL);
+ MLX5_ASSERT(ret == reset_cnt_num);
+ __hws_cnt_r2rcpy(&zcdu, &zcdr, reset_cnt_num);
+ rte_ring_dequeue_zc_elem_finish(reset_list, reset_cnt_num);
+ rte_ring_enqueue_zc_elem_finish(reuse_list, reset_cnt_num);
+
+ if (rte_log_can_log(mlx5_logtype, RTE_LOG_DEBUG)) {
reset_cnt_num = rte_ring_count(reset_list);
- } while (reset_cnt_num > 0);
+ DRV_LOG(DEBUG, "ibdev %s cpool %p wait_reset_cnt=%" PRIu32,
+ sh->ibdev_name, (void *)cpool, reset_cnt_num);
+ }
}
/**
@@ -325,6 +328,11 @@ mlx5_hws_cnt_svc(void *opaque)
rte_spinlock_unlock(&sh->cpool_lock);
query_us = query_cycle / (rte_get_timer_hz() / US_PER_S);
sleep_us = interval - query_us;
+ DRV_LOG(DEBUG, "ibdev %s counter service thread: "
+ "interval_us=%" PRIu64 " query_us=%" PRIu64 " "
+ "sleep_us=%" PRIu64,
+ sh->ibdev_name, interval, query_us,
+ interval > query_us ? sleep_us : 0);
if (interval > query_us)
rte_delay_us_sleep(sleep_us);
}