[dpdk-dev,05/17] librte_acl: introduce DFA nodes compression (group64) for identical entries.

Message ID 1418580659-12595-6-git-send-email-konstantin.ananyev@intel.com (mailing list archive)
State Superseded, archived
Headers

Commit Message

Ananyev, Konstantin Dec. 14, 2014, 6:10 p.m. UTC
Introduced division of whole 256 child transition enties
into 4 sub-groups (64 kids per group).
So 2 groups within the same node with identical children,
can use one set of transition entries.
That allows to compact some DFA nodes and get space savings in the RT table,
without any negative performance impact.
From what I've seen an average space savings: ~20%.

Signed-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
---
 lib/librte_acl/acl.h            |  12 ++-
 lib/librte_acl/acl_gen.c        | 195 ++++++++++++++++++++++++++++------------
 lib/librte_acl/acl_run_scalar.c |  38 ++++----
 lib/librte_acl/acl_run_sse.c    |  99 ++++++--------------
 4 files changed, 196 insertions(+), 148 deletions(-)
  

Patch

diff --git a/lib/librte_acl/acl.h b/lib/librte_acl/acl.h
index 102fa51..3f6ac79 100644
--- a/lib/librte_acl/acl.h
+++ b/lib/librte_acl/acl.h
@@ -47,6 +47,11 @@  extern"C" {
 #define RTE_ACL_DFA_MAX		UINT8_MAX
 #define RTE_ACL_DFA_SIZE	(UINT8_MAX + 1)
 
+#define	RTE_ACL_DFA_GR64_SIZE	64
+#define	RTE_ACL_DFA_GR64_NUM	(RTE_ACL_DFA_SIZE / RTE_ACL_DFA_GR64_SIZE)
+#define	RTE_ACL_DFA_GR64_BIT	\
+	(CHAR_BIT * sizeof(uint32_t) / RTE_ACL_DFA_GR64_NUM)
+
 typedef int bits_t;
 
 #define	RTE_ACL_BIT_SET_SIZE	((UINT8_MAX + 1) / (sizeof(bits_t) * CHAR_BIT))
@@ -100,8 +105,11 @@  struct rte_acl_node {
 	/* number of ranges (transitions w/ consecutive bits) */
 	int32_t                 id;
 	struct rte_acl_match_results *mrt; /* only valid when match_flag != 0 */
-	char                         transitions[RTE_ACL_QUAD_SIZE];
-	/* boundaries for ranged node */
+	union {
+		char            transitions[RTE_ACL_QUAD_SIZE];
+		/* boundaries for ranged node */
+		uint8_t         dfa_gr64[RTE_ACL_DFA_GR64_NUM];
+	};
 	struct rte_acl_node     *next;
 	/* free list link or pointer to duplicate node during merge */
 	struct rte_acl_node     *prev;
diff --git a/lib/librte_acl/acl_gen.c b/lib/librte_acl/acl_gen.c
index b1f766b..c9b7839 100644
--- a/lib/librte_acl/acl_gen.c
+++ b/lib/librte_acl/acl_gen.c
@@ -43,13 +43,14 @@ 
 } while (0)
 
 struct acl_node_counters {
-	int                match;
-	int                match_used;
-	int                single;
-	int                quad;
-	int                quad_vectors;
-	int                dfa;
-	int                smallest_match;
+	int32_t match;
+	int32_t match_used;
+	int32_t single;
+	int32_t quad;
+	int32_t quad_vectors;
+	int32_t dfa;
+	int32_t dfa_gr64;
+	int32_t smallest_match;
 };
 
 struct rte_acl_indices {
@@ -61,24 +62,118 @@  struct rte_acl_indices {
 
 static void
 acl_gen_log_stats(const struct rte_acl_ctx *ctx,
-	const struct acl_node_counters *counts)
+	const struct acl_node_counters *counts,
+	const struct rte_acl_indices *indices)
 {
 	RTE_LOG(DEBUG, ACL, "Gen phase for ACL \"%s\":\n"
 		"runtime memory footprint on socket %d:\n"
 		"single nodes/bytes used: %d/%zu\n"
-		"quad nodes/bytes used: %d/%zu\n"
-		"DFA nodes/bytes used: %d/%zu\n"
+		"quad nodes/vectors/bytes used: %d/%d/%zu\n"
+		"DFA nodes/group64/bytes used: %d/%d/%zu\n"
 		"match nodes/bytes used: %d/%zu\n"
 		"total: %zu bytes\n",
 		ctx->name, ctx->socket_id,
 		counts->single, counts->single * sizeof(uint64_t),
-		counts->quad, counts->quad_vectors * sizeof(uint64_t),
-		counts->dfa, counts->dfa * RTE_ACL_DFA_SIZE * sizeof(uint64_t),
+		counts->quad, counts->quad_vectors,
+		(indices->quad_index - indices->dfa_index) * sizeof(uint64_t),
+		counts->dfa, counts->dfa_gr64,
+		indices->dfa_index * sizeof(uint64_t),
 		counts->match,
 		counts->match * sizeof(struct rte_acl_match_results),
 		ctx->mem_sz);
 }
 
+static uint64_t
+acl_dfa_gen_idx(const struct rte_acl_node *node, uint32_t index)
+{
+	uint64_t idx;
+	uint32_t i;
+
+	idx = 0;
+	for (i = 0; i != RTE_DIM(node->dfa_gr64); i++) {
+		RTE_ACL_VERIFY(node->dfa_gr64[i] < RTE_ACL_DFA_GR64_NUM);
+		RTE_ACL_VERIFY(node->dfa_gr64[i] < node->fanout);
+		idx |= (i - node->dfa_gr64[i]) <<
+			(6 + RTE_ACL_DFA_GR64_BIT * i);
+	}
+
+	return idx << (CHAR_BIT * sizeof(index)) | index | node->node_type;
+}
+
+static void
+acl_dfa_fill_gr64(const struct rte_acl_node *node,
+	const uint64_t src[RTE_ACL_DFA_SIZE], uint64_t dst[RTE_ACL_DFA_SIZE])
+{
+	uint32_t i;
+
+	for (i = 0; i != RTE_DIM(node->dfa_gr64); i++) {
+		memcpy(dst + node->dfa_gr64[i] * RTE_ACL_DFA_GR64_SIZE,
+			src + i * RTE_ACL_DFA_GR64_SIZE,
+			RTE_ACL_DFA_GR64_SIZE * sizeof(dst[0]));
+	}
+}
+
+static uint32_t
+acl_dfa_count_gr64(const uint64_t array_ptr[RTE_ACL_DFA_SIZE],
+	uint8_t gr64[RTE_ACL_DFA_GR64_NUM])
+{
+	uint32_t i, j, k;
+
+	k = 0;
+	for (i = 0; i != RTE_ACL_DFA_GR64_NUM; i++) {
+		gr64[i] = i;
+		for (j = 0; j != i; j++) {
+			if (memcmp(array_ptr + i * RTE_ACL_DFA_GR64_SIZE,
+					array_ptr + j * RTE_ACL_DFA_GR64_SIZE,
+					RTE_ACL_DFA_GR64_SIZE *
+					sizeof(array_ptr[0])) == 0)
+				break;
+		}
+		gr64[i] = (j != i) ? gr64[j] : k++;
+	}
+
+	return k;
+}
+
+static uint32_t
+acl_node_fill_dfa(const struct rte_acl_node *node,
+	uint64_t dfa[RTE_ACL_DFA_SIZE], uint64_t no_match, int32_t resolved)
+{
+	uint32_t n, x;
+	uint32_t ranges, last_bit;
+	struct rte_acl_node *child;
+	struct rte_acl_bitset *bits;
+
+	ranges = 0;
+	last_bit = 0;
+
+	for (n = 0; n < RTE_ACL_DFA_SIZE; n++)
+		dfa[n] = no_match;
+
+	for (x = 0; x < node->num_ptrs; x++) {
+
+		child = node->ptrs[x].ptr;
+		if (child == NULL)
+			continue;
+
+		bits = &node->ptrs[x].values;
+		for (n = 0; n < RTE_ACL_DFA_SIZE; n++) {
+
+			if (bits->bits[n / (sizeof(bits_t) * CHAR_BIT)] &
+				(1 << (n % (sizeof(bits_t) * CHAR_BIT)))) {
+
+				dfa[n] = resolved ? child->node_index : x;
+				ranges += (last_bit == 0);
+				last_bit = 1;
+			} else {
+				last_bit = 0;
+			}
+		}
+	}
+
+	return ranges;
+}
+
 /*
 *  Counts the number of groups of sequential bits that are
 *  either 0 or 1, as specified by the zero_one parameter. This is used to
@@ -150,10 +245,11 @@  acl_count_fanout(struct rte_acl_node *node)
  */
 static int
 acl_count_trie_types(struct acl_node_counters *counts,
-	struct rte_acl_node *node, int match, int force_dfa)
+	struct rte_acl_node *node, uint64_t no_match, int match, int force_dfa)
 {
 	uint32_t n;
 	int num_ptrs;
+	uint64_t dfa[RTE_ACL_DFA_SIZE];
 
 	/* skip if this node has been counted */
 	if (node->node_type != (uint32_t)RTE_ACL_NODE_UNDEFINED)
@@ -186,6 +282,16 @@  acl_count_trie_types(struct acl_node_counters *counts,
 	} else {
 		counts->dfa++;
 		node->node_type = RTE_ACL_NODE_DFA;
+		if (force_dfa != 0) {
+			/* always expand to a max number of nodes. */
+			for (n = 0; n != RTE_DIM(node->dfa_gr64); n++)
+				node->dfa_gr64[n] = n;
+			node->fanout = n;
+		} else {
+			acl_node_fill_dfa(node, dfa, no_match, 0);
+			node->fanout = acl_dfa_count_gr64(dfa, node->dfa_gr64);
+		}
+		counts->dfa_gr64 += node->fanout;
 	}
 
 	/*
@@ -194,7 +300,7 @@  acl_count_trie_types(struct acl_node_counters *counts,
 	for (n = 0; n < node->num_ptrs; n++) {
 		if (node->ptrs[n].ptr != NULL)
 			match = acl_count_trie_types(counts, node->ptrs[n].ptr,
-				match, 0);
+				no_match, match, 0);
 	}
 
 	return match;
@@ -204,38 +310,11 @@  static void
 acl_add_ptrs(struct rte_acl_node *node, uint64_t *node_array, uint64_t no_match,
 	int resolved)
 {
-	uint32_t n, x;
-	int m, ranges, last_bit;
-	struct rte_acl_node *child;
-	struct rte_acl_bitset *bits;
+	uint32_t x;
+	int32_t m;
 	uint64_t *node_a, index, dfa[RTE_ACL_DFA_SIZE];
 
-	ranges = 0;
-	last_bit = 0;
-
-	for (n = 0; n < RTE_DIM(dfa); n++)
-		dfa[n] = no_match;
-
-	for (x = 0; x < node->num_ptrs; x++) {
-
-		child = node->ptrs[x].ptr;
-		if (child == NULL)
-			continue;
-
-		bits = &node->ptrs[x].values;
-		for (n = 0; n < RTE_DIM(dfa); n++) {
-
-			if (bits->bits[n / (sizeof(bits_t) * CHAR_BIT)] &
-				(1 << (n % (sizeof(bits_t) * CHAR_BIT)))) {
-
-				dfa[n] = resolved ? child->node_index : x;
-				ranges += (last_bit == 0);
-				last_bit = 1;
-			} else {
-				last_bit = 0;
-			}
-		}
-	}
+	acl_node_fill_dfa(node, dfa, no_match, resolved);
 
 	/*
 	 * Rather than going from 0 to 256, the range count and
@@ -272,8 +351,7 @@  acl_add_ptrs(struct rte_acl_node *node, uint64_t *node_array, uint64_t no_match,
 		RTE_ACL_VERIFY(m <= RTE_ACL_QUAD_SIZE);
 
 	} else if (node->node_type == RTE_ACL_NODE_DFA && resolved) {
-		for (n = 0; n < RTE_DIM(dfa); n++)
-			node_array[n] = dfa[n];
+		acl_dfa_fill_gr64(node, dfa, node_array);
 	}
 }
 
@@ -286,7 +364,7 @@  static void
 acl_gen_node(struct rte_acl_node *node, uint64_t *node_array,
 	uint64_t no_match, struct rte_acl_indices *index, int num_categories)
 {
-	uint32_t n, *qtrp;
+	uint32_t n, sz, *qtrp;
 	uint64_t *array_ptr;
 	struct rte_acl_match_results *match;
 
@@ -297,10 +375,11 @@  acl_gen_node(struct rte_acl_node *node, uint64_t *node_array,
 
 	switch (node->node_type) {
 	case RTE_ACL_NODE_DFA:
-		node->node_index = index->dfa_index | node->node_type;
 		array_ptr = &node_array[index->dfa_index];
-		index->dfa_index += RTE_ACL_DFA_SIZE;
-		for (n = 0; n < RTE_ACL_DFA_SIZE; n++)
+		node->node_index = acl_dfa_gen_idx(node, index->dfa_index);
+		sz = node->fanout * RTE_ACL_DFA_GR64_SIZE;
+		index->dfa_index += sz;
+		for (n = 0; n < sz; n++)
 			array_ptr[n] = no_match;
 		break;
 	case RTE_ACL_NODE_SINGLE:
@@ -312,7 +391,7 @@  acl_gen_node(struct rte_acl_node *node, uint64_t *node_array,
 		break;
 	case RTE_ACL_NODE_QRANGE:
 		array_ptr = &node_array[index->quad_index];
-		acl_add_ptrs(node, array_ptr, no_match,  0);
+		acl_add_ptrs(node, array_ptr, no_match, 0);
 		qtrp = (uint32_t *)node->transitions;
 		node->node_index = qtrp[0];
 		node->node_index <<= sizeof(index->quad_index) * CHAR_BIT;
@@ -368,7 +447,7 @@  static int
 acl_calc_counts_indices(struct acl_node_counters *counts,
 	struct rte_acl_indices *indices, struct rte_acl_trie *trie,
 	struct rte_acl_bld_trie *node_bld_trie, uint32_t num_tries,
-	int match_num)
+	int match_num, uint64_t no_match)
 {
 	uint32_t n;
 
@@ -379,13 +458,13 @@  acl_calc_counts_indices(struct acl_node_counters *counts,
 	for (n = 0; n < num_tries; n++) {
 		counts->smallest_match = INT32_MAX;
 		match_num = acl_count_trie_types(counts, node_bld_trie[n].trie,
-			match_num, 1);
+			no_match, match_num, 1);
 		trie[n].smallest = counts->smallest_match;
 	}
 
 	indices->dfa_index = RTE_ACL_DFA_SIZE + 1;
 	indices->quad_index = indices->dfa_index +
-		counts->dfa * RTE_ACL_DFA_SIZE;
+		counts->dfa_gr64 * RTE_ACL_DFA_GR64_SIZE;
 	indices->single_index = indices->quad_index + counts->quad_vectors;
 	indices->match_index = indices->single_index + counts->single + 1;
 	indices->match_index = RTE_ALIGN(indices->match_index,
@@ -410,9 +489,11 @@  rte_acl_gen(struct rte_acl_ctx *ctx, struct rte_acl_trie *trie,
 	struct acl_node_counters counts;
 	struct rte_acl_indices indices;
 
+	no_match = RTE_ACL_NODE_MATCH;
+
 	/* Fill counts and indices arrays from the nodes. */
 	match_num = acl_calc_counts_indices(&counts, &indices, trie,
-		node_bld_trie, num_tries, match_num);
+		node_bld_trie, num_tries, match_num, no_match);
 
 	/* Allocate runtime memory (align to cache boundary) */
 	total_size = RTE_ALIGN(data_index_sz, RTE_CACHE_LINE_SIZE) +
@@ -440,11 +521,11 @@  rte_acl_gen(struct rte_acl_ctx *ctx, struct rte_acl_trie *trie,
 	 */
 
 	node_array[RTE_ACL_DFA_SIZE] = RTE_ACL_DFA_SIZE | RTE_ACL_NODE_SINGLE;
-	no_match = RTE_ACL_NODE_MATCH;
 
 	for (n = 0; n < RTE_ACL_DFA_SIZE; n++)
 		node_array[n] = no_match;
 
+	/* NOMATCH result at index 0 */
 	match = ((struct rte_acl_match_results *)(node_array + match_index));
 	memset(match, 0, sizeof(*match));
 
@@ -470,6 +551,6 @@  rte_acl_gen(struct rte_acl_ctx *ctx, struct rte_acl_trie *trie,
 	ctx->trans_table = node_array;
 	memcpy(ctx->trie, trie, sizeof(ctx->trie));
 
-	acl_gen_log_stats(ctx, &counts);
+	acl_gen_log_stats(ctx, &counts, &indices);
 	return 0;
 }
diff --git a/lib/librte_acl/acl_run_scalar.c b/lib/librte_acl/acl_run_scalar.c
index 43c8fc3..40691ce 100644
--- a/lib/librte_acl/acl_run_scalar.c
+++ b/lib/librte_acl/acl_run_scalar.c
@@ -94,15 +94,6 @@  resolve_priority_scalar(uint64_t transition, int n,
 	}
 }
 
-/*
- * When processing the transition, rather than using if/else
- * construct, the offset is calculated for DFA and QRANGE and
- * then conditionally added to the address based on node type.
- * This is done to avoid branch mis-predictions. Since the
- * offset is rather simple calculation it is more efficient
- * to do the calculation and do a condition move rather than
- * a conditional branch to determine which calculation to do.
- */
 static inline uint32_t
 scan_forward(uint32_t input, uint32_t max)
 {
@@ -117,18 +108,27 @@  scalar_transition(const uint64_t *trans_table, uint64_t transition,
 
 	/* break transition into component parts */
 	ranges = transition >> (sizeof(index) * CHAR_BIT);
-
-	/* calc address for a QRANGE node */
-	c = input * SCALAR_QRANGE_MULT;
-	a = ranges | SCALAR_QRANGE_MIN;
 	index = transition & ~RTE_ACL_NODE_INDEX;
-	a -= (c & SCALAR_QRANGE_MASK);
-	b = c & SCALAR_QRANGE_MIN;
 	addr = transition ^ index;
-	a &= SCALAR_QRANGE_MIN;
-	a ^= (ranges ^ b) & (a ^ b);
-	x = scan_forward(a, 32) >> 3;
-	addr += (index == RTE_ACL_NODE_DFA) ? input : x;
+
+	if (index != RTE_ACL_NODE_DFA) {
+		/* calc address for a QRANGE/SINGLE node */
+		c = (uint32_t)input * SCALAR_QRANGE_MULT;
+		a = ranges | SCALAR_QRANGE_MIN;
+		a -= (c & SCALAR_QRANGE_MASK);
+		b = c & SCALAR_QRANGE_MIN;
+		a &= SCALAR_QRANGE_MIN;
+		a ^= (ranges ^ b) & (a ^ b);
+		x = scan_forward(a, 32) >> 3;
+	} else {
+		/* calc address for a DFA node */
+		x = ranges >> (input /
+			RTE_ACL_DFA_GR64_SIZE * RTE_ACL_DFA_GR64_BIT);
+		x &= UINT8_MAX;
+		x = input - x;
+	}
+
+	addr += x;
 
 	/* pickup next transition */
 	transition = *(trans_table + addr);
diff --git a/lib/librte_acl/acl_run_sse.c b/lib/librte_acl/acl_run_sse.c
index 69a9d77..576c92b 100644
--- a/lib/librte_acl/acl_run_sse.c
+++ b/lib/librte_acl/acl_run_sse.c
@@ -40,24 +40,6 @@  enum {
 	SHUFFLE32_SWAP64 = 0x4e,
 };
 
-static const rte_xmm_t mm_type_quad_range = {
-	.u32 = {
-		RTE_ACL_NODE_QRANGE,
-		RTE_ACL_NODE_QRANGE,
-		RTE_ACL_NODE_QRANGE,
-		RTE_ACL_NODE_QRANGE,
-	},
-};
-
-static const rte_xmm_t mm_type_quad_range64 = {
-	.u32 = {
-		RTE_ACL_NODE_QRANGE,
-		RTE_ACL_NODE_QRANGE,
-		0,
-		0,
-	},
-};
-
 static const rte_xmm_t mm_shuffle_input = {
 	.u32 = {0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c},
 };
@@ -70,14 +52,6 @@  static const rte_xmm_t mm_ones_16 = {
 	.u16 = {1, 1, 1, 1, 1, 1, 1, 1},
 };
 
-static const rte_xmm_t mm_bytes = {
-	.u32 = {UINT8_MAX, UINT8_MAX, UINT8_MAX, UINT8_MAX},
-};
-
-static const rte_xmm_t mm_bytes64 = {
-	.u32 = {UINT8_MAX, UINT8_MAX, 0, 0},
-};
-
 static const rte_xmm_t mm_match_mask = {
 	.u32 = {
 		RTE_ACL_NODE_MATCH,
@@ -236,10 +210,14 @@  acl_match_check_x4(int slot, const struct rte_acl_ctx *ctx, struct parms *parms,
  */
 static inline xmm_t
 acl_calc_addr(xmm_t index_mask, xmm_t next_input, xmm_t shuffle_input,
-	xmm_t ones_16, xmm_t bytes, xmm_t type_quad_range,
-	xmm_t *indices1, xmm_t *indices2)
+	xmm_t ones_16, xmm_t indices1, xmm_t indices2)
 {
-	xmm_t addr, node_types, temp;
+	xmm_t addr, node_types, range, temp;
+	xmm_t dfa_msk, dfa_ofs, quad_ofs;
+	xmm_t in, r, t;
+
+	const xmm_t range_base = _mm_set_epi32(0xffffff0c, 0xffffff08,
+		0xffffff04, 0xffffff00);
 
 	/*
 	 * Note that no transition is done for a match
@@ -248,10 +226,13 @@  acl_calc_addr(xmm_t index_mask, xmm_t next_input, xmm_t shuffle_input,
 	 */
 
 	/* Shuffle low 32 into temp and high 32 into indices2 */
-	temp = (xmm_t)MM_SHUFFLEPS((__m128)*indices1, (__m128)*indices2,
-		0x88);
-	*indices2 = (xmm_t)MM_SHUFFLEPS((__m128)*indices1,
-		(__m128)*indices2, 0xdd);
+	temp = (xmm_t)MM_SHUFFLEPS((__m128)indices1, (__m128)indices2, 0x88);
+	range = (xmm_t)MM_SHUFFLEPS((__m128)indices1, (__m128)indices2, 0xdd);
+
+	t = MM_XOR(index_mask, index_mask);
+
+	/* shuffle input byte to all 4 positions of 32 bit value */
+	in = MM_SHUFFLE8(next_input, shuffle_input);
 
 	/* Calc node type and node addr */
 	node_types = MM_ANDNOT(index_mask, temp);
@@ -262,17 +243,15 @@  acl_calc_addr(xmm_t index_mask, xmm_t next_input, xmm_t shuffle_input,
 	 */
 
 	/* mask for DFA type (0) nodes */
-	temp = MM_CMPEQ32(node_types, MM_XOR(node_types, node_types));
+	dfa_msk = MM_CMPEQ32(node_types, t);
 
-	/* add input byte to DFA position */
-	temp = MM_AND(temp, bytes);
-	temp = MM_AND(temp, next_input);
-	addr = MM_ADD32(addr, temp);
+	r = _mm_srli_epi32(in, 30);
+	r = _mm_add_epi8(r, range_base);
 
-	/*
-	 * Calc addr for Range nodes -> range_index + range(input)
-	 */
-	node_types = MM_CMPEQ32(node_types, type_quad_range);
+	t = _mm_srli_epi32(in, 24);
+	r = _mm_shuffle_epi8(range, r);
+
+	dfa_ofs = _mm_sub_epi32(t, r);
 
 	/*
 	 * Calculate number of range boundaries that are less than the
@@ -282,11 +261,8 @@  acl_calc_addr(xmm_t index_mask, xmm_t next_input, xmm_t shuffle_input,
 	 * input byte.
 	 */
 
-	/* shuffle input byte to all 4 positions of 32 bit value */
-	temp = MM_SHUFFLE8(next_input, shuffle_input);
-
 	/* check ranges */
-	temp = MM_CMPGT8(temp, *indices2);
+	temp = MM_CMPGT8(in, range);
 
 	/* convert -1 to 1 (bytes greater than input byte */
 	temp = MM_SIGN8(temp, temp);
@@ -295,10 +271,10 @@  acl_calc_addr(xmm_t index_mask, xmm_t next_input, xmm_t shuffle_input,
 	temp = MM_MADD8(temp, temp);
 
 	/* horizontal add pairs of words into dwords */
-	temp = MM_MADD16(temp, ones_16);
+	quad_ofs = MM_MADD16(temp, ones_16);
 
 	/* mask to range type nodes */
-	temp = MM_AND(temp, node_types);
+	temp = _mm_blendv_epi8(quad_ofs, dfa_ofs, dfa_msk);
 
 	/* add index into node position */
 	return MM_ADD32(addr, temp);
@@ -309,8 +285,8 @@  acl_calc_addr(xmm_t index_mask, xmm_t next_input, xmm_t shuffle_input,
  */
 static inline xmm_t
 transition4(xmm_t index_mask, xmm_t next_input, xmm_t shuffle_input,
-	xmm_t ones_16, xmm_t bytes, xmm_t type_quad_range,
-	const uint64_t *trans, xmm_t *indices1, xmm_t *indices2)
+	xmm_t ones_16, const uint64_t *trans,
+	xmm_t *indices1, xmm_t *indices2)
 {
 	xmm_t addr;
 	uint64_t trans0, trans2;
@@ -318,7 +294,7 @@  transition4(xmm_t index_mask, xmm_t next_input, xmm_t shuffle_input,
 	 /* Calculate the address (array index) for all 4 transitions. */
 
 	addr = acl_calc_addr(index_mask, next_input, shuffle_input, ones_16,
-		bytes, type_quad_range, indices1, indices2);
+		*indices1, *indices2);
 
 	 /* Gather 64 bit transitions and pack back into 2 registers. */
 
@@ -408,42 +384,34 @@  search_sse_8(const struct rte_acl_ctx *ctx, const uint8_t **data,
 
 		input0 = transition4(mm_index_mask.m, input0,
 			mm_shuffle_input.m, mm_ones_16.m,
-			mm_bytes.m, mm_type_quad_range.m,
 			flows.trans, &indices1, &indices2);
 
 		input1 = transition4(mm_index_mask.m, input1,
 			mm_shuffle_input.m, mm_ones_16.m,
-			mm_bytes.m, mm_type_quad_range.m,
 			flows.trans, &indices3, &indices4);
 
 		input0 = transition4(mm_index_mask.m, input0,
 			mm_shuffle_input.m, mm_ones_16.m,
-			mm_bytes.m, mm_type_quad_range.m,
 			flows.trans, &indices1, &indices2);
 
 		input1 = transition4(mm_index_mask.m, input1,
 			mm_shuffle_input.m, mm_ones_16.m,
-			mm_bytes.m, mm_type_quad_range.m,
 			flows.trans, &indices3, &indices4);
 
 		input0 = transition4(mm_index_mask.m, input0,
 			mm_shuffle_input.m, mm_ones_16.m,
-			mm_bytes.m, mm_type_quad_range.m,
 			flows.trans, &indices1, &indices2);
 
 		input1 = transition4(mm_index_mask.m, input1,
 			mm_shuffle_input.m, mm_ones_16.m,
-			mm_bytes.m, mm_type_quad_range.m,
 			flows.trans, &indices3, &indices4);
 
 		input0 = transition4(mm_index_mask.m, input0,
 			mm_shuffle_input.m, mm_ones_16.m,
-			mm_bytes.m, mm_type_quad_range.m,
 			flows.trans, &indices1, &indices2);
 
 		input1 = transition4(mm_index_mask.m, input1,
 			mm_shuffle_input.m, mm_ones_16.m,
-			mm_bytes.m, mm_type_quad_range.m,
 			flows.trans, &indices3, &indices4);
 
 		 /* Check for any matches. */
@@ -496,22 +464,18 @@  search_sse_4(const struct rte_acl_ctx *ctx, const uint8_t **data,
 		/* Process the 4 bytes of input on each stream. */
 		input = transition4(mm_index_mask.m, input,
 			mm_shuffle_input.m, mm_ones_16.m,
-			mm_bytes.m, mm_type_quad_range.m,
 			flows.trans, &indices1, &indices2);
 
 		 input = transition4(mm_index_mask.m, input,
 			mm_shuffle_input.m, mm_ones_16.m,
-			mm_bytes.m, mm_type_quad_range.m,
 			flows.trans, &indices1, &indices2);
 
 		 input = transition4(mm_index_mask.m, input,
 			mm_shuffle_input.m, mm_ones_16.m,
-			mm_bytes.m, mm_type_quad_range.m,
 			flows.trans, &indices1, &indices2);
 
 		 input = transition4(mm_index_mask.m, input,
 			mm_shuffle_input.m, mm_ones_16.m,
-			mm_bytes.m, mm_type_quad_range.m,
 			flows.trans, &indices1, &indices2);
 
 		/* Check for any matches. */
@@ -524,8 +488,7 @@  search_sse_4(const struct rte_acl_ctx *ctx, const uint8_t **data,
 
 static inline xmm_t
 transition2(xmm_t index_mask, xmm_t next_input, xmm_t shuffle_input,
-	xmm_t ones_16, xmm_t bytes, xmm_t type_quad_range,
-	const uint64_t *trans, xmm_t *indices1)
+	xmm_t ones_16, const uint64_t *trans, xmm_t *indices1)
 {
 	uint64_t t;
 	xmm_t addr, indices2;
@@ -533,7 +496,7 @@  transition2(xmm_t index_mask, xmm_t next_input, xmm_t shuffle_input,
 	indices2 = MM_XOR(ones_16, ones_16);
 
 	addr = acl_calc_addr(index_mask, next_input, shuffle_input, ones_16,
-		bytes, type_quad_range, indices1, &indices2);
+		*indices1, indices2);
 
 	/* Gather 64 bit transitions and pack 2 per register. */
 
@@ -583,22 +546,18 @@  search_sse_2(const struct rte_acl_ctx *ctx, const uint8_t **data,
 
 		input = transition2(mm_index_mask64.m, input,
 			mm_shuffle_input64.m, mm_ones_16.m,
-			mm_bytes64.m, mm_type_quad_range64.m,
 			flows.trans, &indices);
 
 		input = transition2(mm_index_mask64.m, input,
 			mm_shuffle_input64.m, mm_ones_16.m,
-			mm_bytes64.m, mm_type_quad_range64.m,
 			flows.trans, &indices);
 
 		input = transition2(mm_index_mask64.m, input,
 			mm_shuffle_input64.m, mm_ones_16.m,
-			mm_bytes64.m, mm_type_quad_range64.m,
 			flows.trans, &indices);
 
 		input = transition2(mm_index_mask64.m, input,
 			mm_shuffle_input64.m, mm_ones_16.m,
-			mm_bytes64.m, mm_type_quad_range64.m,
 			flows.trans, &indices);
 
 		/* Check for any matches. */