[dpdk-dev,09/17] EAL: introduce rte_ymm and relatives in rte_common_vect.h.
Commit Message
New data type to manipulate 256-bit AVX values.
Rename the field in rte_xmm to keep common naming across SSE/AVX fields.
Signed-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
---
examples/l3fwd/main.c | 2 +-
lib/librte_acl/acl_run_sse.c | 88 ++++++++++++-------------
lib/librte_eal/common/include/rte_common_vect.h | 27 +++++++-
lib/librte_lpm/rte_lpm.h | 2 +-
4 files changed, 71 insertions(+), 48 deletions(-)
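For context, a minimal standalone sketch of how the renamed rte_xmm field and the new
rte_ymm union are meant to be used. It mirrors the definitions added by this patch
(trimmed to a few members) instead of including rte_common_vect.h, and assumes an
AVX-capable compiler (e.g. gcc -mavx); the lane values are made up for illustration.

#include <stdint.h>
#include <stdio.h>
#include <immintrin.h>

typedef __m128i xmm_t;
typedef __m256i ymm_t;

/* Trimmed copies of the unions from this patch, for a standalone build. */
typedef union rte_xmm {
	xmm_t    x;	/* renamed from 'm' by this patch */
	uint32_t u32[sizeof(xmm_t) / sizeof(uint32_t)];
} rte_xmm_t;

typedef union rte_ymm {
	ymm_t    y;
	xmm_t    x[sizeof(ymm_t) / sizeof(xmm_t)];
	uint32_t u32[sizeof(ymm_t) / sizeof(uint32_t)];
} rte_ymm_t;

int
main(void)
{
	rte_xmm_t dst;
	rte_ymm_t wide;

	/* Fill through the intrinsic-typed member, read back per lane. */
	dst.x = _mm_set_epi32(3, 2, 1, 0);
	printf("xmm lane 2 = %u\n", (unsigned)dst.u32[2]);

	/* The 256-bit union also exposes its two 128-bit halves via x[]. */
	wide.y = _mm256_set1_epi32(7);
	wide.x[1] = dst.x;
	printf("ymm lane 1 = %u, lane 6 = %u\n",
		(unsigned)wide.u32[1], (unsigned)wide.u32[6]);
	return 0;
}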
Comments
On Sun, Dec 14, 2014 at 06:10:51PM +0000, Konstantin Ananyev wrote:
> New data type to manipulate 256-bit AVX values.
> Rename the field in rte_xmm to keep common naming across SSE/AVX fields.
>
> Signed-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
> ---
> examples/l3fwd/main.c | 2 +-
> lib/librte_acl/acl_run_sse.c | 88 ++++++++++++-------------
> lib/librte_eal/common/include/rte_common_vect.h | 27 +++++++-
> lib/librte_lpm/rte_lpm.h | 2 +-
> 4 files changed, 71 insertions(+), 48 deletions(-)
>
> diff --git a/examples/l3fwd/main.c b/examples/l3fwd/main.c
> index bf0fcdb..dc6cae2 100644
> --- a/examples/l3fwd/main.c
> +++ b/examples/l3fwd/main.c
> @@ -1168,7 +1168,7 @@ processx4_step2(const struct lcore_conf *qconf, __m128i dip, uint32_t flag,
> if (likely(flag != 0)) {
> rte_lpm_lookupx4(qconf->ipv4_lookup_struct, dip, dprt, portid);
> } else {
> - dst.m = dip;
> + dst.x = dip;
> dprt[0] = get_dst_port(qconf, pkt[0], dst.u32[0], portid);
> dprt[1] = get_dst_port(qconf, pkt[1], dst.u32[1], portid);
> dprt[2] = get_dst_port(qconf, pkt[2], dst.u32[2], portid);
> diff --git a/lib/librte_acl/acl_run_sse.c b/lib/librte_acl/acl_run_sse.c
> index 09e32be..4605b58 100644
> --- a/lib/librte_acl/acl_run_sse.c
> +++ b/lib/librte_acl/acl_run_sse.c
> @@ -359,16 +359,16 @@ search_sse_8(const struct rte_acl_ctx *ctx, const uint8_t **data,
>
> /* Check for any matches. */
> acl_match_check_x4(0, ctx, parms, &flows,
> - &indices1, &indices2, mm_match_mask.m);
> + &indices1, &indices2, mm_match_mask.x);
> acl_match_check_x4(4, ctx, parms, &flows,
> - &indices3, &indices4, mm_match_mask.m);
> + &indices3, &indices4, mm_match_mask.x);
>
> while (flows.started > 0) {
>
> /* Gather 4 bytes of input data for each stream. */
> - input0 = MM_INSERT32(mm_ones_16.m, GET_NEXT_4BYTES(parms, 0),
> + input0 = MM_INSERT32(mm_ones_16.x, GET_NEXT_4BYTES(parms, 0),
> 0);
> - input1 = MM_INSERT32(mm_ones_16.m, GET_NEXT_4BYTES(parms, 4),
> + input1 = MM_INSERT32(mm_ones_16.x, GET_NEXT_4BYTES(parms, 4),
> 0);
>
> input0 = MM_INSERT32(input0, GET_NEXT_4BYTES(parms, 1), 1);
> @@ -382,43 +382,43 @@ search_sse_8(const struct rte_acl_ctx *ctx, const uint8_t **data,
>
> /* Process the 4 bytes of input on each stream. */
>
> - input0 = transition4(mm_index_mask.m, input0,
> - mm_shuffle_input.m, mm_ones_16.m,
> + input0 = transition4(mm_index_mask.x, input0,
> + mm_shuffle_input.x, mm_ones_16.x,
> flows.trans, &indices1, &indices2);
>
> - input1 = transition4(mm_index_mask.m, input1,
> - mm_shuffle_input.m, mm_ones_16.m,
> + input1 = transition4(mm_index_mask.x, input1,
> + mm_shuffle_input.x, mm_ones_16.x,
> flows.trans, &indices3, &indices4);
>
> - input0 = transition4(mm_index_mask.m, input0,
> - mm_shuffle_input.m, mm_ones_16.m,
> + input0 = transition4(mm_index_mask.x, input0,
> + mm_shuffle_input.x, mm_ones_16.x,
> flows.trans, &indices1, &indices2);
>
> - input1 = transition4(mm_index_mask.m, input1,
> - mm_shuffle_input.m, mm_ones_16.m,
> + input1 = transition4(mm_index_mask.x, input1,
> + mm_shuffle_input.x, mm_ones_16.x,
> flows.trans, &indices3, &indices4);
>
> - input0 = transition4(mm_index_mask.m, input0,
> - mm_shuffle_input.m, mm_ones_16.m,
> + input0 = transition4(mm_index_mask.x, input0,
> + mm_shuffle_input.x, mm_ones_16.x,
> flows.trans, &indices1, &indices2);
>
> - input1 = transition4(mm_index_mask.m, input1,
> - mm_shuffle_input.m, mm_ones_16.m,
> + input1 = transition4(mm_index_mask.x, input1,
> + mm_shuffle_input.x, mm_ones_16.x,
> flows.trans, &indices3, &indices4);
>
> - input0 = transition4(mm_index_mask.m, input0,
> - mm_shuffle_input.m, mm_ones_16.m,
> + input0 = transition4(mm_index_mask.x, input0,
> + mm_shuffle_input.x, mm_ones_16.x,
> flows.trans, &indices1, &indices2);
>
> - input1 = transition4(mm_index_mask.m, input1,
> - mm_shuffle_input.m, mm_ones_16.m,
> + input1 = transition4(mm_index_mask.x, input1,
> + mm_shuffle_input.x, mm_ones_16.x,
> flows.trans, &indices3, &indices4);
>
> /* Check for any matches. */
> acl_match_check_x4(0, ctx, parms, &flows,
> - &indices1, &indices2, mm_match_mask.m);
> + &indices1, &indices2, mm_match_mask.x);
> acl_match_check_x4(4, ctx, parms, &flows,
> - &indices3, &indices4, mm_match_mask.m);
> + &indices3, &indices4, mm_match_mask.x);
> }
>
> return 0;
> @@ -451,36 +451,36 @@ search_sse_4(const struct rte_acl_ctx *ctx, const uint8_t **data,
>
> /* Check for any matches. */
> acl_match_check_x4(0, ctx, parms, &flows,
> - &indices1, &indices2, mm_match_mask.m);
> + &indices1, &indices2, mm_match_mask.x);
>
> while (flows.started > 0) {
>
> /* Gather 4 bytes of input data for each stream. */
> - input = MM_INSERT32(mm_ones_16.m, GET_NEXT_4BYTES(parms, 0), 0);
> + input = MM_INSERT32(mm_ones_16.x, GET_NEXT_4BYTES(parms, 0), 0);
> input = MM_INSERT32(input, GET_NEXT_4BYTES(parms, 1), 1);
> input = MM_INSERT32(input, GET_NEXT_4BYTES(parms, 2), 2);
> input = MM_INSERT32(input, GET_NEXT_4BYTES(parms, 3), 3);
>
> /* Process the 4 bytes of input on each stream. */
> - input = transition4(mm_index_mask.m, input,
> - mm_shuffle_input.m, mm_ones_16.m,
> + input = transition4(mm_index_mask.x, input,
> + mm_shuffle_input.x, mm_ones_16.x,
> flows.trans, &indices1, &indices2);
>
> - input = transition4(mm_index_mask.m, input,
> - mm_shuffle_input.m, mm_ones_16.m,
> + input = transition4(mm_index_mask.x, input,
> + mm_shuffle_input.x, mm_ones_16.x,
> flows.trans, &indices1, &indices2);
>
> - input = transition4(mm_index_mask.m, input,
> - mm_shuffle_input.m, mm_ones_16.m,
> + input = transition4(mm_index_mask.x, input,
> + mm_shuffle_input.x, mm_ones_16.x,
> flows.trans, &indices1, &indices2);
>
> - input = transition4(mm_index_mask.m, input,
> - mm_shuffle_input.m, mm_ones_16.m,
> + input = transition4(mm_index_mask.x, input,
> + mm_shuffle_input.x, mm_ones_16.x,
> flows.trans, &indices1, &indices2);
>
> /* Check for any matches. */
> acl_match_check_x4(0, ctx, parms, &flows,
> - &indices1, &indices2, mm_match_mask.m);
> + &indices1, &indices2, mm_match_mask.x);
> }
>
> return 0;
> @@ -534,35 +534,35 @@ search_sse_2(const struct rte_acl_ctx *ctx, const uint8_t **data,
> indices = MM_LOADU((xmm_t *) &index_array[0]);
>
> /* Check for any matches. */
> - acl_match_check_x2(0, ctx, parms, &flows, &indices, mm_match_mask64.m);
> + acl_match_check_x2(0, ctx, parms, &flows, &indices, mm_match_mask64.x);
>
> while (flows.started > 0) {
>
> /* Gather 4 bytes of input data for each stream. */
> - input = MM_INSERT32(mm_ones_16.m, GET_NEXT_4BYTES(parms, 0), 0);
> + input = MM_INSERT32(mm_ones_16.x, GET_NEXT_4BYTES(parms, 0), 0);
> input = MM_INSERT32(input, GET_NEXT_4BYTES(parms, 1), 1);
>
> /* Process the 4 bytes of input on each stream. */
>
> - input = transition2(mm_index_mask64.m, input,
> - mm_shuffle_input64.m, mm_ones_16.m,
> + input = transition2(mm_index_mask64.x, input,
> + mm_shuffle_input64.x, mm_ones_16.x,
> flows.trans, &indices);
>
> - input = transition2(mm_index_mask64.m, input,
> - mm_shuffle_input64.m, mm_ones_16.m,
> + input = transition2(mm_index_mask64.x, input,
> + mm_shuffle_input64.x, mm_ones_16.x,
> flows.trans, &indices);
>
> - input = transition2(mm_index_mask64.m, input,
> - mm_shuffle_input64.m, mm_ones_16.m,
> + input = transition2(mm_index_mask64.x, input,
> + mm_shuffle_input64.x, mm_ones_16.x,
> flows.trans, &indices);
>
> - input = transition2(mm_index_mask64.m, input,
> - mm_shuffle_input64.m, mm_ones_16.m,
> + input = transition2(mm_index_mask64.x, input,
> + mm_shuffle_input64.x, mm_ones_16.x,
> flows.trans, &indices);
>
> /* Check for any matches. */
> acl_match_check_x2(0, ctx, parms, &flows, &indices,
> - mm_match_mask64.m);
> + mm_match_mask64.x);
> }
>
> return 0;
> diff --git a/lib/librte_eal/common/include/rte_common_vect.h b/lib/librte_eal/common/include/rte_common_vect.h
> index 95bf4b1..617470b 100644
> --- a/lib/librte_eal/common/include/rte_common_vect.h
> +++ b/lib/librte_eal/common/include/rte_common_vect.h
> @@ -54,6 +54,10 @@
> #include <smmintrin.h>
> #endif
>
> +#if defined(__AVX__)
> +#include <immintrin.h>
> +#endif
> +
> #else
>
> #include <x86intrin.h>
> @@ -70,7 +74,7 @@ typedef __m128i xmm_t;
> #define XMM_MASK (XMM_SIZE - 1)
>
> typedef union rte_xmm {
> - xmm_t m;
> + xmm_t x;
> uint8_t u8[XMM_SIZE / sizeof(uint8_t)];
> uint16_t u16[XMM_SIZE / sizeof(uint16_t)];
> uint32_t u32[XMM_SIZE / sizeof(uint32_t)];
> @@ -78,10 +82,29 @@ typedef union rte_xmm {
> double pd[XMM_SIZE / sizeof(double)];
> } rte_xmm_t;
>
> +#ifdef __AVX__
> +
Why are you excluding this type based on instruction availability? I don't see
anything in the definition that makes any of the included types dependent on AVX
availability. (One possible AVX-independent layout is sketched after the quoted hunk
below.)
> +typedef __m256i ymm_t;
> +
> +#define YMM_SIZE (sizeof(ymm_t))
> +#define YMM_MASK (YMM_SIZE - 1)
> +
> +typedef union rte_ymm {
> + ymm_t y;
> + xmm_t x[YMM_SIZE / sizeof(xmm_t)];
> + uint8_t u8[YMM_SIZE / sizeof(uint8_t)];
> + uint16_t u16[YMM_SIZE / sizeof(uint16_t)];
> + uint32_t u32[YMM_SIZE / sizeof(uint32_t)];
> + uint64_t u64[YMM_SIZE / sizeof(uint64_t)];
> + double pd[YMM_SIZE / sizeof(double)];
> +} rte_ymm_t;
> +
> +#endif /* __AVX__ */
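On the objection above about guarding the new type with #ifdef __AVX__: only the
__m256i member itself needs AVX support from the compiler. A hedged sketch of one
possible AVX-independent layout follows; the two-halves fallback struct is
hypothetical and is not what this patch (or rte_common_vect.h) actually does.

#include <stdint.h>
#include <emmintrin.h>		/* __m128i, SSE2 only */

typedef __m128i xmm_t;

#ifdef __AVX__
#include <immintrin.h>
typedef __m256i ymm_t;		/* native 256-bit type when AVX is available */
#else
/* Hypothetical fallback: two 128-bit halves.  Natural alignment drops from
 * 32 to 16 bytes, which any real user of the type would have to consider. */
typedef struct { xmm_t lo, hi; } ymm_t;
#endif

#define YMM_SIZE	(sizeof(ymm_t))
#define YMM_MASK	(YMM_SIZE - 1)

/* With either definition the overlay union below is valid C and 32 bytes. */
typedef union rte_ymm {
	ymm_t    y;
	xmm_t    x[YMM_SIZE / sizeof(xmm_t)];
	uint8_t  u8[YMM_SIZE / sizeof(uint8_t)];
	uint16_t u16[YMM_SIZE / sizeof(uint16_t)];
	uint32_t u32[YMM_SIZE / sizeof(uint32_t)];
	uint64_t u64[YMM_SIZE / sizeof(uint64_t)];
	double   pd[YMM_SIZE / sizeof(double)];
} rte_ymm_t;

Whether that trade-off is acceptable depends on how rte_ymm_t ends up being used;
the patch as posted sidesteps it by making the whole definition AVX-only.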
@@ -1168,7 +1168,7 @@ processx4_step2(const struct lcore_conf *qconf, __m128i dip, uint32_t flag,
if (likely(flag != 0)) {
rte_lpm_lookupx4(qconf->ipv4_lookup_struct, dip, dprt, portid);
} else {
- dst.m = dip;
+ dst.x = dip;
dprt[0] = get_dst_port(qconf, pkt[0], dst.u32[0], portid);
dprt[1] = get_dst_port(qconf, pkt[1], dst.u32[1], portid);
dprt[2] = get_dst_port(qconf, pkt[2], dst.u32[2], portid);
@@ -359,16 +359,16 @@ search_sse_8(const struct rte_acl_ctx *ctx, const uint8_t **data,
/* Check for any matches. */
acl_match_check_x4(0, ctx, parms, &flows,
- &indices1, &indices2, mm_match_mask.m);
+ &indices1, &indices2, mm_match_mask.x);
acl_match_check_x4(4, ctx, parms, &flows,
- &indices3, &indices4, mm_match_mask.m);
+ &indices3, &indices4, mm_match_mask.x);
while (flows.started > 0) {
/* Gather 4 bytes of input data for each stream. */
- input0 = MM_INSERT32(mm_ones_16.m, GET_NEXT_4BYTES(parms, 0),
+ input0 = MM_INSERT32(mm_ones_16.x, GET_NEXT_4BYTES(parms, 0),
0);
- input1 = MM_INSERT32(mm_ones_16.m, GET_NEXT_4BYTES(parms, 4),
+ input1 = MM_INSERT32(mm_ones_16.x, GET_NEXT_4BYTES(parms, 4),
0);
input0 = MM_INSERT32(input0, GET_NEXT_4BYTES(parms, 1), 1);
@@ -382,43 +382,43 @@ search_sse_8(const struct rte_acl_ctx *ctx, const uint8_t **data,
/* Process the 4 bytes of input on each stream. */
- input0 = transition4(mm_index_mask.m, input0,
- mm_shuffle_input.m, mm_ones_16.m,
+ input0 = transition4(mm_index_mask.x, input0,
+ mm_shuffle_input.x, mm_ones_16.x,
flows.trans, &indices1, &indices2);
- input1 = transition4(mm_index_mask.m, input1,
- mm_shuffle_input.m, mm_ones_16.m,
+ input1 = transition4(mm_index_mask.x, input1,
+ mm_shuffle_input.x, mm_ones_16.x,
flows.trans, &indices3, &indices4);
- input0 = transition4(mm_index_mask.m, input0,
- mm_shuffle_input.m, mm_ones_16.m,
+ input0 = transition4(mm_index_mask.x, input0,
+ mm_shuffle_input.x, mm_ones_16.x,
flows.trans, &indices1, &indices2);
- input1 = transition4(mm_index_mask.m, input1,
- mm_shuffle_input.m, mm_ones_16.m,
+ input1 = transition4(mm_index_mask.x, input1,
+ mm_shuffle_input.x, mm_ones_16.x,
flows.trans, &indices3, &indices4);
- input0 = transition4(mm_index_mask.m, input0,
- mm_shuffle_input.m, mm_ones_16.m,
+ input0 = transition4(mm_index_mask.x, input0,
+ mm_shuffle_input.x, mm_ones_16.x,
flows.trans, &indices1, &indices2);
- input1 = transition4(mm_index_mask.m, input1,
- mm_shuffle_input.m, mm_ones_16.m,
+ input1 = transition4(mm_index_mask.x, input1,
+ mm_shuffle_input.x, mm_ones_16.x,
flows.trans, &indices3, &indices4);
- input0 = transition4(mm_index_mask.m, input0,
- mm_shuffle_input.m, mm_ones_16.m,
+ input0 = transition4(mm_index_mask.x, input0,
+ mm_shuffle_input.x, mm_ones_16.x,
flows.trans, &indices1, &indices2);
- input1 = transition4(mm_index_mask.m, input1,
- mm_shuffle_input.m, mm_ones_16.m,
+ input1 = transition4(mm_index_mask.x, input1,
+ mm_shuffle_input.x, mm_ones_16.x,
flows.trans, &indices3, &indices4);
/* Check for any matches. */
acl_match_check_x4(0, ctx, parms, &flows,
- &indices1, &indices2, mm_match_mask.m);
+ &indices1, &indices2, mm_match_mask.x);
acl_match_check_x4(4, ctx, parms, &flows,
- &indices3, &indices4, mm_match_mask.m);
+ &indices3, &indices4, mm_match_mask.x);
}
return 0;
@@ -451,36 +451,36 @@ search_sse_4(const struct rte_acl_ctx *ctx, const uint8_t **data,
/* Check for any matches. */
acl_match_check_x4(0, ctx, parms, &flows,
- &indices1, &indices2, mm_match_mask.m);
+ &indices1, &indices2, mm_match_mask.x);
while (flows.started > 0) {
/* Gather 4 bytes of input data for each stream. */
- input = MM_INSERT32(mm_ones_16.m, GET_NEXT_4BYTES(parms, 0), 0);
+ input = MM_INSERT32(mm_ones_16.x, GET_NEXT_4BYTES(parms, 0), 0);
input = MM_INSERT32(input, GET_NEXT_4BYTES(parms, 1), 1);
input = MM_INSERT32(input, GET_NEXT_4BYTES(parms, 2), 2);
input = MM_INSERT32(input, GET_NEXT_4BYTES(parms, 3), 3);
/* Process the 4 bytes of input on each stream. */
- input = transition4(mm_index_mask.m, input,
- mm_shuffle_input.m, mm_ones_16.m,
+ input = transition4(mm_index_mask.x, input,
+ mm_shuffle_input.x, mm_ones_16.x,
flows.trans, &indices1, &indices2);
- input = transition4(mm_index_mask.m, input,
- mm_shuffle_input.m, mm_ones_16.m,
+ input = transition4(mm_index_mask.x, input,
+ mm_shuffle_input.x, mm_ones_16.x,
flows.trans, &indices1, &indices2);
- input = transition4(mm_index_mask.m, input,
- mm_shuffle_input.m, mm_ones_16.m,
+ input = transition4(mm_index_mask.x, input,
+ mm_shuffle_input.x, mm_ones_16.x,
flows.trans, &indices1, &indices2);
- input = transition4(mm_index_mask.m, input,
- mm_shuffle_input.m, mm_ones_16.m,
+ input = transition4(mm_index_mask.x, input,
+ mm_shuffle_input.x, mm_ones_16.x,
flows.trans, &indices1, &indices2);
/* Check for any matches. */
acl_match_check_x4(0, ctx, parms, &flows,
- &indices1, &indices2, mm_match_mask.m);
+ &indices1, &indices2, mm_match_mask.x);
}
return 0;
@@ -534,35 +534,35 @@ search_sse_2(const struct rte_acl_ctx *ctx, const uint8_t **data,
indices = MM_LOADU((xmm_t *) &index_array[0]);
/* Check for any matches. */
- acl_match_check_x2(0, ctx, parms, &flows, &indices, mm_match_mask64.m);
+ acl_match_check_x2(0, ctx, parms, &flows, &indices, mm_match_mask64.x);
while (flows.started > 0) {
/* Gather 4 bytes of input data for each stream. */
- input = MM_INSERT32(mm_ones_16.m, GET_NEXT_4BYTES(parms, 0), 0);
+ input = MM_INSERT32(mm_ones_16.x, GET_NEXT_4BYTES(parms, 0), 0);
input = MM_INSERT32(input, GET_NEXT_4BYTES(parms, 1), 1);
/* Process the 4 bytes of input on each stream. */
- input = transition2(mm_index_mask64.m, input,
- mm_shuffle_input64.m, mm_ones_16.m,
+ input = transition2(mm_index_mask64.x, input,
+ mm_shuffle_input64.x, mm_ones_16.x,
flows.trans, &indices);
- input = transition2(mm_index_mask64.m, input,
- mm_shuffle_input64.m, mm_ones_16.m,
+ input = transition2(mm_index_mask64.x, input,
+ mm_shuffle_input64.x, mm_ones_16.x,
flows.trans, &indices);
- input = transition2(mm_index_mask64.m, input,
- mm_shuffle_input64.m, mm_ones_16.m,
+ input = transition2(mm_index_mask64.x, input,
+ mm_shuffle_input64.x, mm_ones_16.x,
flows.trans, &indices);
- input = transition2(mm_index_mask64.m, input,
- mm_shuffle_input64.m, mm_ones_16.m,
+ input = transition2(mm_index_mask64.x, input,
+ mm_shuffle_input64.x, mm_ones_16.x,
flows.trans, &indices);
/* Check for any matches. */
acl_match_check_x2(0, ctx, parms, &flows, &indices,
- mm_match_mask64.m);
+ mm_match_mask64.x);
}
return 0;
@@ -54,6 +54,10 @@
#include <smmintrin.h>
#endif
+#if defined(__AVX__)
+#include <immintrin.h>
+#endif
+
#else
#include <x86intrin.h>
@@ -70,7 +74,7 @@ typedef __m128i xmm_t;
#define XMM_MASK (XMM_SIZE - 1)
typedef union rte_xmm {
- xmm_t m;
+ xmm_t x;
uint8_t u8[XMM_SIZE / sizeof(uint8_t)];
uint16_t u16[XMM_SIZE / sizeof(uint16_t)];
uint32_t u32[XMM_SIZE / sizeof(uint32_t)];
@@ -78,10 +82,29 @@ typedef union rte_xmm {
double pd[XMM_SIZE / sizeof(double)];
} rte_xmm_t;
+#ifdef __AVX__
+
+typedef __m256i ymm_t;
+
+#define YMM_SIZE (sizeof(ymm_t))
+#define YMM_MASK (YMM_SIZE - 1)
+
+typedef union rte_ymm {
+ ymm_t y;
+ xmm_t x[YMM_SIZE / sizeof(xmm_t)];
+ uint8_t u8[YMM_SIZE / sizeof(uint8_t)];
+ uint16_t u16[YMM_SIZE / sizeof(uint16_t)];
+ uint32_t u32[YMM_SIZE / sizeof(uint32_t)];
+ uint64_t u64[YMM_SIZE / sizeof(uint64_t)];
+ double pd[YMM_SIZE / sizeof(double)];
+} rte_ymm_t;
+
+#endif /* __AVX__ */
+
#ifdef RTE_ARCH_I686
#define _mm_cvtsi128_si64(a) ({ \
rte_xmm_t m; \
- m.m = (a); \
+ m.x = (a); \
(m.u64[0]); \
})
#endif
@@ -420,7 +420,7 @@ rte_lpm_lookupx4(const struct rte_lpm *lpm, __m128i ip, uint16_t hop[4],
tbl[3] = *(const uint16_t *)&lpm->tbl24[idx >> 32];
/* get 4 indexes for tbl8[]. */
- i8.m = _mm_and_si128(ip, mask8);
+ i8.x = _mm_and_si128(ip, mask8);
pt = (uint64_t)tbl[0] |
(uint64_t)tbl[1] << 16 |