[dpdk-dev,12/17] librte_acl: Remove search_sse_2 and relatives.

Message ID 1418580659-12595-13-git-send-email-konstantin.ananyev@intel.com (mailing list archive)
State Superseded, archived
Headers

Commit Message

Ananyev, Konstantin Dec. 14, 2014, 6:10 p.m. UTC
Previous improvements made the scalar method the fastest one
for tiny bunches of packets (< 4).
That allows us to remove the dedicated vector code path for small numbers of
packets (search_sse_2)
and always use the scalar method in such cases.

Signed-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
---
 lib/librte_acl/acl_run_avx2.c |   2 +-
 lib/librte_acl/acl_run_sse.c  |   3 +-
 lib/librte_acl/acl_run_sse.h  | 110 ------------------------------------------
 3 files changed, 3 insertions(+), 112 deletions(-)
  

Patch

diff --git a/lib/librte_acl/acl_run_avx2.c b/lib/librte_acl/acl_run_avx2.c
index 8419d5d..a717c27 100644
--- a/lib/librte_acl/acl_run_avx2.c
+++ b/lib/librte_acl/acl_run_avx2.c
@@ -53,6 +53,6 @@  rte_acl_classify_avx2(const struct rte_acl_ctx *ctx, const uint8_t **data,
 	else if (num >= MAX_SEARCHES_SSE4)
 		return search_sse_4(ctx, data, results, num, categories);
 	else
-		return search_sse_2(ctx, data, results, num,
+		return rte_acl_classify_scalar(ctx, data, results, num,
 			categories);
 }
diff --git a/lib/librte_acl/acl_run_sse.c b/lib/librte_acl/acl_run_sse.c
index 77b32b3..a5a7d36 100644
--- a/lib/librte_acl/acl_run_sse.c
+++ b/lib/librte_acl/acl_run_sse.c
@@ -42,5 +42,6 @@  rte_acl_classify_sse(const struct rte_acl_ctx *ctx, const uint8_t **data,
 	else if (num >= MAX_SEARCHES_SSE4)
 		return search_sse_4(ctx, data, results, num, categories);
 	else
-		return search_sse_2(ctx, data, results, num, categories);
+		return rte_acl_classify_scalar(ctx, data, results, num,
+			categories);
 }
diff --git a/lib/librte_acl/acl_run_sse.h b/lib/librte_acl/acl_run_sse.h
index e33e16b..1b7870e 100644
--- a/lib/librte_acl/acl_run_sse.h
+++ b/lib/librte_acl/acl_run_sse.h
@@ -45,10 +45,6 @@  static const rte_xmm_t xmm_shuffle_input = {
 	.u32 = {0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c},
 };
 
-static const rte_xmm_t xmm_shuffle_input64 = {
-	.u32 = {0x00000000, 0x04040404, 0x80808080, 0x80808080},
-};
-
 static const rte_xmm_t xmm_ones_16 = {
 	.u16 = {1, 1, 1, 1, 1, 1, 1, 1},
 };
@@ -62,15 +58,6 @@  static const rte_xmm_t xmm_match_mask = {
 	},
 };
 
-static const rte_xmm_t xmm_match_mask64 = {
-	.u32 = {
-		RTE_ACL_NODE_MATCH,
-		0,
-		RTE_ACL_NODE_MATCH,
-		0,
-	},
-};
-
 static const rte_xmm_t xmm_index_mask = {
 	.u32 = {
 		RTE_ACL_NODE_INDEX,
@@ -80,16 +67,6 @@  static const rte_xmm_t xmm_index_mask = {
 	},
 };
 
-static const rte_xmm_t xmm_index_mask64 = {
-	.u32 = {
-		RTE_ACL_NODE_INDEX,
-		RTE_ACL_NODE_INDEX,
-		0,
-		0,
-	},
-};
-
-
 /*
  * Resolve priority for multiple results (sse version).
  * This consists comparing the priority of the current traversal with the
@@ -161,22 +138,6 @@  acl_process_matches(xmm_t *indices, int slot, const struct rte_acl_ctx *ctx,
 }
 
 /*
- * Check for a match in 2 transitions (contained in SSE register)
- */
-static inline __attribute__((always_inline)) void
-acl_match_check_x2(int slot, const struct rte_acl_ctx *ctx, struct parms *parms,
-	struct acl_flow_data *flows, xmm_t *indices, xmm_t match_mask)
-{
-	xmm_t temp;
-
-	temp = MM_AND(match_mask, *indices);
-	while (!MM_TESTZ(temp, temp)) {
-		acl_process_matches(indices, slot, ctx, parms, flows);
-		temp = MM_AND(match_mask, *indices);
-	}
-}
-
-/*
  * Check for any match in 4 transitions (contained in 2 SSE registers)
  */
 static inline __attribute__((always_inline)) void
@@ -460,74 +421,3 @@  search_sse_4(const struct rte_acl_ctx *ctx, const uint8_t **data,
 
 	return 0;
 }
-
-static inline __attribute__((always_inline)) xmm_t
-transition2(xmm_t next_input, const uint64_t *trans, xmm_t *indices1)
-{
-	uint64_t t;
-	xmm_t addr, indices2;
-
-	indices2 = _mm_setzero_si128();
-
-	addr = calc_addr_sse(xmm_index_mask.x, next_input, xmm_shuffle_input.x,
-		xmm_ones_16.x, *indices1, indices2);
-
-	/* Gather 64 bit transitions and pack 2 per register. */
-
-	t = trans[MM_CVT32(addr)];
-
-	/* get slot 1 */
-	addr = MM_SHUFFLE32(addr, SHUFFLE32_SLOT1);
-	*indices1 = MM_SET64(trans[MM_CVT32(addr)], t);
-
-	return MM_SRL32(next_input, CHAR_BIT);
-}
-
-/*
- * Execute trie traversal with 2 traversals in parallel.
- */
-static inline int
-search_sse_2(const struct rte_acl_ctx *ctx, const uint8_t **data,
-	uint32_t *results, uint32_t total_packets, uint32_t categories)
-{
-	int n;
-	struct acl_flow_data flows;
-	uint64_t index_array[MAX_SEARCHES_SSE2];
-	struct completion cmplt[MAX_SEARCHES_SSE2];
-	struct parms parms[MAX_SEARCHES_SSE2];
-	xmm_t input, indices;
-
-	acl_set_flow(&flows, cmplt, RTE_DIM(cmplt), data, results,
-		total_packets, categories, ctx->trans_table);
-
-	for (n = 0; n < MAX_SEARCHES_SSE2; n++) {
-		cmplt[n].count = 0;
-		index_array[n] = acl_start_next_trie(&flows, parms, n, ctx);
-	}
-
-	indices = MM_LOADU((xmm_t *) &index_array[0]);
-
-	/* Check for any matches. */
-	acl_match_check_x2(0, ctx, parms, &flows, &indices,
-		xmm_match_mask64.x);
-
-	while (flows.started > 0) {
-
-		/* Gather 4 bytes of input data for each stream. */
-		input = _mm_cvtsi32_si128(GET_NEXT_4BYTES(parms, 0));
-		input = MM_INSERT32(input, GET_NEXT_4BYTES(parms, 1), 1);
-
-		/* Process the 4 bytes of input on each stream. */
-
-		input = transition2(input, flows.trans, &indices);
-		input = transition2(input, flows.trans, &indices);
-		input = transition2(input, flows.trans, &indices);
-		input = transition2(input, flows.trans, &indices);
-
-		/* Check for any matches. */
-		acl_match_check_x2(0, ctx, parms, &flows, &indices,
-			xmm_match_mask64.x);
-	}
-
-	return 0;
-}