app/testeventdev: add vector worker to perf test

Message ID 20221202101139.1522945-1-vfialko@marvell.com (mailing list archive)
State Accepted, archived
Delegated to: Jerin Jacob
Series: app/testeventdev: add vector worker to perf test

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/loongarch-compilation success Compilation OK
ci/loongarch-unit-testing success Unit Testing PASS
ci/github-robot: build fail github build: failed
ci/iol-intel-Performance success Performance Testing PASS
ci/iol-intel-Functional success Functional Testing PASS
ci/iol-aarch64-unit-testing warning Testing issues
ci/iol-aarch64-compile-testing fail Testing issues
ci/iol-testing fail Testing issues
ci/iol-x86_64-unit-testing success Testing PASS
ci/iol-x86_64-compile-testing fail Testing issues
ci/Intel-compilation fail Compilation issues
ci/intel-Testing success Testing PASS

Commit Message

Volodymyr Fialko Dec. 2, 2022, 10:11 a.m. UTC
  Add a worker for handling vector events to the perf tests; vector events
can be generated by the crypto adapter producer.

Example:
    ./dpdk-test-eventdev -l 0-2 -a <EVENTDEV> -a <CRYPTODEV> -- \
    --prod_type_cryptodev --crypto_adptr_mode 1 --test=perf_queue \
    --stlist=a --wlcores 1 --plcores 2 --prod_enq_burst_sz 32 \
    --enable_vector --vector_tmo_ns 0 --nb_flows 2

Signed-off-by: Volodymyr Fialko <vfialko@marvell.com>
---
 app/test-eventdev/test_perf_atq.c    |  62 ++++++++++++++--
 app/test-eventdev/test_perf_common.c |  68 +++++++++++++++---
 app/test-eventdev/test_perf_common.h | 102 ++++++++++++++++++++++++++-
 app/test-eventdev/test_perf_queue.c  |  63 +++++++++++++++--
 doc/guides/tools/testeventdev.rst    |  12 ++--
 5 files changed, 279 insertions(+), 28 deletions(-)
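
For reference, the new workers follow the same dequeue/process/forward loop as the existing scalar perf workers, but operate on RTE_EVENT_TYPE_CRYPTODEV_VECTOR events whose payload is an rte_event_vector of crypto ops. Below is a minimal sketch of that pattern, simplified from the code in this patch (error handling, latency measurement and mempool cleanup omitted); it is illustrative only, not the exact implementation:

    /* Simplified sketch of the vector worker loop added by this patch. */
    #include <stdbool.h>
    #include <stdint.h>
    #include <rte_eventdev.h>

    static void
    vector_worker_loop(uint8_t dev_id, uint8_t port_id, uint8_t nb_stages,
                       uint8_t last_stage, volatile bool *done)
    {
            struct rte_event ev;

            while (!*done) {
                    if (!rte_event_dequeue_burst(dev_id, port_id, &ev, 1, 0))
                            continue;

                    if (ev.event_type == RTE_EVENT_TYPE_CRYPTODEV_VECTOR) {
                            /* ev.vec->ptrs[] holds rte_crypto_op pointers,
                             * ev.vec->nb_elem gives the number of ops. */
                    }

                    if ((ev.queue_id % nb_stages) == last_stage) {
                            /* Last stage: free mbufs/ops and return the
                             * vector to its mempool. */
                    } else {
                            /* Forward to the next stage as a CPU vector event. */
                            ev.queue_id++;
                            ev.op = RTE_EVENT_OP_FORWARD;
                            ev.event_type = RTE_EVENT_TYPE_CPU_VECTOR;
                            while (!rte_event_enqueue_burst(dev_id, port_id,
                                                            &ev, 1) && !*done)
                                    ;
                    }
            }
    }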
  

Comments

Volodymyr Fialko Dec. 6, 2022, 9:27 a.m. UTC | #1
> -----Original Message-----
> From: Volodymyr Fialko <vfialko@marvell.com>
> Sent: Friday, December 2, 2022 11:12 AM
> To: dev@dpdk.org; Jerin Jacob Kollanukkaran <jerinj@marvell.com>
> Cc: Anoob Joseph <anoobj@marvell.com>; Akhil Goyal <gakhil@marvell.com>; Volodymyr Fialko
> <vfialko@marvell.com>
> Subject: [PATCH] app/testeventdev: add vector worker to perf test
> 
> Add worker for handling vector events to perf tests, vector events could be generated by crypto adapter
> producer.
> 
> Example:
>     ./dpdk-test-eventdev -l 0-2 -a <EVENTDEV> -a <CRYPTODEV> -- \
>     --prod_type_cryptodev --crypto_adptr_mode 1 --test=perf_queue \
>     --stlist=a --wlcores 1 --plcores 2 --prod_enq_burst_sz 32 \
>     --enable_vector --vector_tmo_ns 0 --nb_flows 2
> 
> Signed-off-by: Volodymyr Fialko <vfialko@marvell.com>
> ---
Depends-on: series-26008 ("build: fix missing crypto vec limits in version")
  
Shijith Thotton Jan. 17, 2023, 3:29 p.m. UTC | #2
>Add worker for handling vector events to perf tests, vector events could
>be generated by crypto adapter producer.
>
>Example:
>    ./dpdk-test-eventdev -l 0-2 -a <EVENTDEV> -a <CRYPTODEV> -- \
>    --prod_type_cryptodev --crypto_adptr_mode 1 --test=perf_queue \
>    --stlist=a --wlcores 1 --plcores 2 --prod_enq_burst_sz 32 \
>    --enable_vector --vector_tmo_ns 0 --nb_flows 2
>
>Signed-off-by: Volodymyr Fialko <vfialko@marvell.com>
 
Acked-by: Shijith Thotton <sthotton@marvell.com>

  
Jerin Jacob Jan. 24, 2023, 11:53 a.m. UTC | #3
On Tue, Jan 17, 2023 at 8:59 PM Shijith Thotton <sthotton@marvell.com> wrote:
>
>
> >Add worker for handling vector events to perf tests, vector events could
> >be generated by crypto adapter producer.
> >
> >Example:
> >    ./dpdk-test-eventdev -l 0-2 -a <EVENTDEV> -a <CRYPTODEV> -- \
> >    --prod_type_cryptodev --crypto_adptr_mode 1 --test=perf_queue \
> >    --stlist=a --wlcores 1 --plcores 2 --prod_enq_burst_sz 32 \
> >    --enable_vector --vector_tmo_ns 0 --nb_flows 2
> >
> >Signed-off-by: Volodymyr Fialko <vfialko@marvell.com>
>
> Acked-by: Shijith Thotton <sthotton@marvell.com>

Applied to dpdk-next-net-eventdev/for-main. Thanks
  

Patch

diff --git a/app/test-eventdev/test_perf_atq.c b/app/test-eventdev/test_perf_atq.c
index 9d30081117..4ac60cc38b 100644
--- a/app/test-eventdev/test_perf_atq.c
+++ b/app/test-eventdev/test_perf_atq.c
@@ -24,14 +24,22 @@  atq_fwd_event(struct rte_event *const ev, uint8_t *const sched_type_list,
 	ev->event_type = RTE_EVENT_TYPE_CPU;
 }
 
+static __rte_always_inline void
+atq_fwd_event_vector(struct rte_event *const ev, uint8_t *const sched_type_list,
+		const uint8_t nb_stages)
+{
+	ev->sub_event_type++;
+	ev->sched_type = sched_type_list[ev->sub_event_type % nb_stages];
+	ev->op = RTE_EVENT_OP_FORWARD;
+	ev->event_type = RTE_EVENT_TYPE_CPU_VECTOR;
+}
+
 static int
 perf_atq_worker(void *arg, const int enable_fwd_latency)
 {
-	struct perf_elt *pe = NULL;
 	uint16_t enq = 0, deq = 0;
 	struct rte_event ev;
 	PERF_WORKER_INIT;
-	uint8_t stage;
 
 	while (t->done == false) {
 		deq = rte_event_dequeue_burst(dev, port, &ev, 1, 0);
@@ -79,9 +87,7 @@  perf_atq_worker_burst(void *arg, const int enable_fwd_latency)
 	/* +1 to avoid prefetch out of array check */
 	struct rte_event ev[BURST_SIZE + 1];
 	uint16_t enq = 0, nb_rx = 0;
-	struct perf_elt *pe = NULL;
 	PERF_WORKER_INIT;
-	uint8_t stage;
 	uint16_t i;
 
 	while (t->done == false) {
@@ -134,6 +140,50 @@  perf_atq_worker_burst(void *arg, const int enable_fwd_latency)
 	return 0;
 }
 
+static int
+perf_atq_worker_vector(void *arg, const int enable_fwd_latency)
+{
+	uint16_t enq = 0, deq = 0;
+	struct rte_event ev;
+	PERF_WORKER_INIT;
+
+	RTE_SET_USED(sz);
+	RTE_SET_USED(cnt);
+	RTE_SET_USED(prod_crypto_type);
+
+	while (t->done == false) {
+		deq = rte_event_dequeue_burst(dev, port, &ev, 1, 0);
+
+		if (!deq)
+			continue;
+
+		if (ev.event_type == RTE_EVENT_TYPE_CRYPTODEV_VECTOR) {
+			if (perf_handle_crypto_vector_ev(&ev, &pe, enable_fwd_latency))
+				continue;
+		}
+
+		stage = ev.sub_event_type % nb_stages;
+		/* First q in pipeline, mark timestamp to compute fwd latency */
+		if (enable_fwd_latency && !prod_timer_type && stage == 0)
+			perf_mark_fwd_latency(pe);
+
+		/* Last stage in pipeline */
+		if (unlikely(stage == laststage)) {
+			perf_process_vector_last_stage(pool, t->ca_op_pool, &ev, w,
+							enable_fwd_latency);
+		} else {
+			atq_fwd_event_vector(&ev, sched_type_list, nb_stages);
+			do {
+				enq = rte_event_enqueue_burst(dev, port, &ev, 1);
+			} while (!enq && !t->done);
+		}
+	}
+
+	perf_worker_cleanup(pool, dev, port, &ev, enq, deq);
+
+	return 0;
+}
+
 static int
 worker_wrapper(void *arg)
 {
@@ -144,7 +194,9 @@  worker_wrapper(void *arg)
 	const int fwd_latency = opt->fwd_latency;
 
 	/* allow compiler to optimize */
-	if (!burst && !fwd_latency)
+	if (opt->ena_vector && opt->prod_type == EVT_PROD_TYPE_EVENT_CRYPTO_ADPTR)
+		return perf_atq_worker_vector(arg, fwd_latency);
+	else if (!burst && !fwd_latency)
 		return perf_atq_worker(arg, 0);
 	else if (!burst && fwd_latency)
 		return perf_atq_worker(arg, 1);
diff --git a/app/test-eventdev/test_perf_common.c b/app/test-eventdev/test_perf_common.c
index 140c0c2dc3..8d7e483c55 100644
--- a/app/test-eventdev/test_perf_common.c
+++ b/app/test-eventdev/test_perf_common.c
@@ -827,10 +827,13 @@  perf_event_timer_adapter_setup(struct test_perf *t)
 static int
 perf_event_crypto_adapter_setup(struct test_perf *t, struct prod_data *p)
 {
+	struct rte_event_crypto_adapter_queue_conf conf;
 	struct evt_options *opt = t->opt;
 	uint32_t cap;
 	int ret;
 
+	memset(&conf, 0, sizeof(conf));
+
 	ret = rte_event_crypto_adapter_caps_get(p->dev_id, p->ca.cdev_id, &cap);
 	if (ret) {
 		evt_err("Failed to get crypto adapter capabilities");
@@ -849,19 +852,53 @@  perf_event_crypto_adapter_setup(struct test_perf *t, struct prod_data *p)
 		return -ENOTSUP;
 	}
 
-	if (cap & RTE_EVENT_CRYPTO_ADAPTER_CAP_INTERNAL_PORT_QP_EV_BIND) {
-		struct rte_event_crypto_adapter_queue_conf conf;
+	if (opt->ena_vector) {
+		struct rte_event_crypto_adapter_vector_limits limits;
+
+		if (!(cap & RTE_EVENT_CRYPTO_ADAPTER_CAP_EVENT_VECTOR)) {
+			evt_err("Crypto adapter doesn't support event vector");
+			return -EINVAL;
+		}
+
+		ret = rte_event_crypto_adapter_vector_limits_get(p->dev_id, p->ca.cdev_id, &limits);
+		if (ret) {
+			evt_err("Failed to get crypto adapter's vector limits");
+			return ret;
+		}
 
-		memset(&conf, 0, sizeof(conf));
+		if (opt->vector_size < limits.min_sz || opt->vector_size > limits.max_sz) {
+			evt_err("Vector size [%d] not within limits max[%d] min[%d]",
+				opt->vector_size, limits.max_sz, limits.min_sz);
+			return -EINVAL;
+		}
+
+		if (limits.log2_sz && !rte_is_power_of_2(opt->vector_size)) {
+			evt_err("Vector size [%d] not power of 2", opt->vector_size);
+			return -EINVAL;
+		}
+
+		if (opt->vector_tmo_nsec > limits.max_timeout_ns ||
+			opt->vector_tmo_nsec < limits.min_timeout_ns) {
+			evt_err("Vector timeout [%" PRIu64 "] not within limits "
+				"max[%" PRIu64 "] min[%" PRIu64 "]",
+				opt->vector_tmo_nsec, limits.max_timeout_ns, limits.min_timeout_ns);
+			return -EINVAL;
+		}
+
+		conf.vector_mp = t->ca_vector_pool;
+		conf.vector_sz = opt->vector_size;
+		conf.vector_timeout_ns = opt->vector_tmo_nsec;
+		conf.flags |= RTE_EVENT_CRYPTO_ADAPTER_EVENT_VECTOR;
+	}
+
+	if (cap & RTE_EVENT_CRYPTO_ADAPTER_CAP_INTERNAL_PORT_QP_EV_BIND) {
 		conf.ev.sched_type = RTE_SCHED_TYPE_ATOMIC;
 		conf.ev.queue_id = p->queue_id;
-		ret = rte_event_crypto_adapter_queue_pair_add(
-			TEST_PERF_CA_ID, p->ca.cdev_id, p->ca.cdev_qp_id, &conf);
-	} else {
-		ret = rte_event_crypto_adapter_queue_pair_add(
-			TEST_PERF_CA_ID, p->ca.cdev_id, p->ca.cdev_qp_id, NULL);
 	}
 
+	ret = rte_event_crypto_adapter_queue_pair_add(
+		TEST_PERF_CA_ID, p->ca.cdev_id, p->ca.cdev_qp_id, &conf);
+
 	return ret;
 }
 
@@ -1411,6 +1448,19 @@  perf_cryptodev_setup(struct evt_test *test, struct evt_options *opt)
 		goto err;
 	}
 
+	if (opt->ena_vector) {
+		unsigned int nb_elem = (opt->pool_sz / opt->vector_size) * 2;
+		nb_elem = RTE_MAX(512U, nb_elem);
+		nb_elem += evt_nr_active_lcores(opt->wlcores) * 32;
+		t->ca_vector_pool = rte_event_vector_pool_create("vector_pool", nb_elem, 32,
+				opt->vector_size, opt->socket_id);
+		if (t->ca_vector_pool == NULL) {
+			evt_err("Failed to create event vector pool");
+			ret = -ENOMEM;
+			goto err;
+		}
+	}
+
 	/*
 	 * Calculate number of needed queue pairs, based on the amount of
 	 * available number of logical cores and crypto devices. For instance,
@@ -1467,6 +1517,7 @@  perf_cryptodev_setup(struct evt_test *test, struct evt_options *opt)
 	rte_mempool_free(t->ca_op_pool);
 	rte_mempool_free(t->ca_sess_pool);
 	rte_mempool_free(t->ca_asym_sess_pool);
+	rte_mempool_free(t->ca_vector_pool);
 
 	return ret;
 }
@@ -1507,6 +1558,7 @@  perf_cryptodev_destroy(struct evt_test *test, struct evt_options *opt)
 	rte_mempool_free(t->ca_op_pool);
 	rte_mempool_free(t->ca_sess_pool);
 	rte_mempool_free(t->ca_asym_sess_pool);
+	rte_mempool_free(t->ca_vector_pool);
 }
 
 int
diff --git a/app/test-eventdev/test_perf_common.h b/app/test-eventdev/test_perf_common.h
index 503b6aa1db..faedd471c6 100644
--- a/app/test-eventdev/test_perf_common.h
+++ b/app/test-eventdev/test_perf_common.h
@@ -71,6 +71,7 @@  struct test_perf {
 	struct rte_mempool *ca_op_pool;
 	struct rte_mempool *ca_sess_pool;
 	struct rte_mempool *ca_asym_sess_pool;
+	struct rte_mempool *ca_vector_pool;
 } __rte_cache_aligned;
 
 struct perf_elt {
@@ -103,6 +104,8 @@  struct perf_elt {
 	uint8_t cnt = 0;\
 	void *bufs[16] __rte_cache_aligned;\
 	int const sz = RTE_DIM(bufs);\
+	uint8_t stage;\
+	struct perf_elt *pe = NULL;\
 	if (opt->verbose_level > 1)\
 		printf("%s(): lcore %d dev_id %d port=%d\n", __func__,\
 				rte_lcore_id(), dev, port)
@@ -143,6 +146,64 @@  perf_handle_crypto_ev(struct rte_event *ev, struct perf_elt **pe, int enable_fwd
 	return 0;
 }
 
+static __rte_always_inline struct perf_elt *
+perf_elt_from_vec_get(struct rte_event_vector *vec)
+{
+	/* Timestamp for vector event stored in first element */
+	struct rte_crypto_op *cop = vec->ptrs[0];
+	struct rte_mbuf *m;
+
+	if (cop->type == RTE_CRYPTO_OP_TYPE_SYMMETRIC) {
+		m = cop->sym->m_dst == NULL ? cop->sym->m_src : cop->sym->m_dst;
+		return rte_pktmbuf_mtod(m, struct perf_elt *);
+	} else {
+		return RTE_PTR_ADD(cop->asym->modex.result.data, cop->asym->modex.result.length);
+	}
+}
+
+static __rte_always_inline int
+perf_handle_crypto_vector_ev(struct rte_event *ev, struct perf_elt **pe,
+		const int enable_fwd_latency)
+{
+	struct rte_event_vector *vec = ev->vec;
+	struct rte_crypto_op *cop;
+	struct rte_mbuf *m;
+	int i, n = 0;
+	void *data;
+
+	for (i = 0; i < vec->nb_elem; i++) {
+		cop = vec->ptrs[i];
+		if (unlikely(cop->status != RTE_CRYPTO_OP_STATUS_SUCCESS)) {
+			if (cop->type == RTE_CRYPTO_OP_TYPE_SYMMETRIC) {
+				m = cop->sym->m_dst == NULL ? cop->sym->m_src : cop->sym->m_dst;
+				rte_pktmbuf_free(m);
+			} else {
+				data = cop->asym->modex.result.data;
+				rte_mempool_put(rte_mempool_from_obj(data), data);
+			}
+			rte_crypto_op_free(cop);
+			continue;
+		}
+		vec->ptrs[n++] = cop;
+	}
+
+	/* All cops failed, free the vector */
+	if (n == 0) {
+		rte_mempool_put(rte_mempool_from_obj(vec), vec);
+		return -ENOENT;
+	}
+
+	vec->nb_elem = n;
+
+	/* Forward latency not enabled - perf data will be not accessed */
+	if (!enable_fwd_latency)
+		return 0;
+
+	/* Get pointer to perf data */
+	*pe = perf_elt_from_vec_get(vec);
+
+	return 0;
+}
 
 static __rte_always_inline int
 perf_process_last_stage(struct rte_mempool *const pool, uint8_t prod_crypto_type,
@@ -195,9 +256,8 @@  perf_process_last_stage_latency(struct rte_mempool *const pool, uint8_t prod_cry
 	struct perf_elt *pe;
 	void *to_free_in_bulk;
 
-	/* release fence here ensures event_prt is
-	 * stored before updating the number of
-	 * processed packets for worker lcores
+	/* Release fence here ensures event_prt is stored before updating the number of processed
+	 * packets for worker lcores.
 	 */
 	rte_atomic_thread_fence(__ATOMIC_RELEASE);
 	w->processed_pkts++;
@@ -237,6 +297,42 @@  perf_process_last_stage_latency(struct rte_mempool *const pool, uint8_t prod_cry
 	return count;
 }
 
+static __rte_always_inline void
+perf_process_vector_last_stage(struct rte_mempool *const pool,
+		struct rte_mempool *const ca_pool, struct rte_event *const ev,
+		struct worker_data *const w, const bool enable_fwd_latency)
+{
+	struct rte_event_vector *vec = ev->vec;
+	struct rte_crypto_op *cop;
+	void *bufs[vec->nb_elem];
+	struct perf_elt *pe;
+	uint64_t latency;
+	int i;
+
+	/* Release fence here ensures event_prt is stored before updating the number of processed
+	 * packets for worker lcores.
+	 */
+	rte_atomic_thread_fence(__ATOMIC_RELEASE);
+	w->processed_pkts += vec->nb_elem;
+
+	if (enable_fwd_latency) {
+		pe = perf_elt_from_vec_get(vec);
+		latency = rte_get_timer_cycles() - pe->timestamp;
+		w->latency += latency;
+	}
+
+	for (i = 0; i < vec->nb_elem; i++) {
+		cop = vec->ptrs[i];
+		if (cop->type == RTE_CRYPTO_OP_TYPE_SYMMETRIC)
+			bufs[i] = cop->sym->m_dst == NULL ? cop->sym->m_src : cop->sym->m_dst;
+		else
+			bufs[i] = cop->asym->modex.result.data;
+	}
+
+	rte_mempool_put_bulk(pool, bufs, vec->nb_elem);
+	rte_mempool_put_bulk(ca_pool, (void * const *)vec->ptrs, vec->nb_elem);
+	rte_mempool_put(rte_mempool_from_obj(vec), vec);
+}
 
 static inline int
 perf_nb_event_ports(struct evt_options *opt)
diff --git a/app/test-eventdev/test_perf_queue.c b/app/test-eventdev/test_perf_queue.c
index 69ef0ebbac..2399cfb69b 100644
--- a/app/test-eventdev/test_perf_queue.c
+++ b/app/test-eventdev/test_perf_queue.c
@@ -25,15 +25,22 @@  fwd_event(struct rte_event *const ev, uint8_t *const sched_type_list,
 	ev->event_type = RTE_EVENT_TYPE_CPU;
 }
 
+static __rte_always_inline void
+fwd_event_vector(struct rte_event *const ev, uint8_t *const sched_type_list,
+		const uint8_t nb_stages)
+{
+	ev->queue_id++;
+	ev->sched_type = sched_type_list[ev->queue_id % nb_stages];
+	ev->op = RTE_EVENT_OP_FORWARD;
+	ev->event_type = RTE_EVENT_TYPE_CPU_VECTOR;
+}
+
 static int
 perf_queue_worker(void *arg, const int enable_fwd_latency)
 {
-	struct perf_elt *pe = NULL;
 	uint16_t enq = 0, deq = 0;
 	struct rte_event ev;
 	PERF_WORKER_INIT;
-	uint8_t stage;
-
 
 	while (t->done == false) {
 		deq = rte_event_dequeue_burst(dev, port, &ev, 1, 0);
@@ -82,9 +89,7 @@  perf_queue_worker_burst(void *arg, const int enable_fwd_latency)
 	/* +1 to avoid prefetch out of array check */
 	struct rte_event ev[BURST_SIZE + 1];
 	uint16_t enq = 0, nb_rx = 0;
-	struct perf_elt *pe = NULL;
 	PERF_WORKER_INIT;
-	uint8_t stage;
 	uint16_t i;
 
 	while (t->done == false) {
@@ -137,6 +142,50 @@  perf_queue_worker_burst(void *arg, const int enable_fwd_latency)
 	return 0;
 }
 
+static int
+perf_queue_worker_vector(void *arg, const int enable_fwd_latency)
+{
+	uint16_t enq = 0, deq = 0;
+	struct rte_event ev;
+	PERF_WORKER_INIT;
+
+	RTE_SET_USED(sz);
+	RTE_SET_USED(cnt);
+	RTE_SET_USED(prod_crypto_type);
+
+	while (t->done == false) {
+		deq = rte_event_dequeue_burst(dev, port, &ev, 1, 0);
+
+		if (!deq)
+			continue;
+
+		if (ev.event_type == RTE_EVENT_TYPE_CRYPTODEV_VECTOR) {
+			if (perf_handle_crypto_vector_ev(&ev, &pe, enable_fwd_latency))
+				continue;
+		}
+
+		stage = ev.queue_id % nb_stages;
+		/* First q in pipeline, mark timestamp to compute fwd latency */
+		if (enable_fwd_latency && !prod_timer_type && stage == 0)
+			perf_mark_fwd_latency(pe);
+
+		/* Last stage in pipeline */
+		if (unlikely(stage == laststage)) {
+			perf_process_vector_last_stage(pool, t->ca_op_pool, &ev, w,
+						       enable_fwd_latency);
+		} else {
+			fwd_event_vector(&ev, sched_type_list, nb_stages);
+			do {
+				enq = rte_event_enqueue_burst(dev, port, &ev, 1);
+			} while (!enq && !t->done);
+		}
+	}
+
+	perf_worker_cleanup(pool, dev, port, &ev, enq, deq);
+
+	return 0;
+}
+
 static int
 worker_wrapper(void *arg)
 {
@@ -147,7 +196,9 @@  worker_wrapper(void *arg)
 	const int fwd_latency = opt->fwd_latency;
 
 	/* allow compiler to optimize */
-	if (!burst && !fwd_latency)
+	if (opt->ena_vector && opt->prod_type == EVT_PROD_TYPE_EVENT_CRYPTO_ADPTR)
+		return perf_queue_worker_vector(arg, fwd_latency);
+	else if (!burst && !fwd_latency)
 		return perf_queue_worker(arg, 0);
 	else if (!burst && fwd_latency)
 		return perf_queue_worker(arg, 1);
diff --git a/doc/guides/tools/testeventdev.rst b/doc/guides/tools/testeventdev.rst
index cd278e8998..6f065b9752 100644
--- a/doc/guides/tools/testeventdev.rst
+++ b/doc/guides/tools/testeventdev.rst
@@ -185,18 +185,18 @@  The following are the application command-line options:
 
 * ``--enable_vector``
 
-       Enable event vector for Rx/Tx adapters.
-       Only applicable for `pipeline_atq` and `pipeline_queue` tests.
+       Enable event vector for Rx/Tx/crypto adapters.
+       Only applicable for `pipeline_*` and `perf_*` tests.
 
 * ``--vector_size``
 
-       Vector size to configure for the Rx adapter.
-       Only applicable for `pipeline_atq` and `pipeline_queue` tests.
+       Vector size to configure for the Rx/crypto adapter.
+       Only applicable for `pipeline_*` and `perf_*` tests.
 
 * ``--vector_tmo_ns``
 
-       Vector timeout nanoseconds to be configured for the Rx adapter.
-       Only applicable for `pipeline_atq` and `pipeline_queue` tests.
+       Vector timeout nanoseconds to be configured for the Rx/crypto adapter.
+       Only applicable for `pipeline_*` and `perf_*` tests.
 
 * ``--per_port_pool``