@@ -1,5 +1,6 @@
/* SPDX-License-Identifier: BSD-3-Clause
* Copyright(c) 2010-2014 Intel Corporation
+ * Copyright(c) 2022 SmartShare Systems
*/
#include <stdint.h>
@@ -36,6 +37,19 @@ static size_t buf_sizes[TEST_VALUE_RANGE];
/* Data is aligned on this many bytes (power of 2) */
#define ALIGNMENT_UNIT 32
+static const uint64_t nt_mode_flags[4] = {
+ 0,
+ RTE_MEMOPS_F_SRC_NT,
+ RTE_MEMOPS_F_DST_NT,
+ RTE_MEMOPS_F_SRC_NT | RTE_MEMOPS_F_DST_NT
+};
+static const char * const nt_mode_str[4] = {
+ "none",
+ "src",
+ "dst",
+ "src+dst"
+};
+
/*
* Create two buffers, and initialise one with random values. These are copied
@@ -44,12 +58,13 @@ static size_t buf_sizes[TEST_VALUE_RANGE];
* changed.
*/
static int
-test_single_memcpy(unsigned int off_src, unsigned int off_dst, size_t size)
+test_single_memcpy(unsigned int off_src, unsigned int off_dst, size_t size, unsigned int nt_mode)
{
unsigned int i;
uint8_t dest[SMALL_BUFFER_SIZE + ALIGNMENT_UNIT];
uint8_t src[SMALL_BUFFER_SIZE + ALIGNMENT_UNIT];
void * ret;
+ const uint64_t flags = nt_mode_flags[nt_mode];
/* Setup buffers */
for (i = 0; i < SMALL_BUFFER_SIZE + ALIGNMENT_UNIT; i++) {
@@ -58,18 +73,23 @@ test_single_memcpy(unsigned int off_src, unsigned int off_dst, size_t size)
}
/* Do the copy */
- ret = rte_memcpy(dest + off_dst, src + off_src, size);
- if (ret != (dest + off_dst)) {
- printf("rte_memcpy() returned %p, not %p\n",
- ret, dest + off_dst);
+ if (nt_mode) {
+ rte_memcpy_ex(dest + off_dst, src + off_src, size, flags);
+ } else {
+ ret = rte_memcpy(dest + off_dst, src + off_src, size);
+ if (ret != (dest + off_dst)) {
+ printf("rte_memcpy() returned %p, not %p\n",
+ ret, dest + off_dst);
+ }
}
/* Check nothing before offset is affected */
for (i = 0; i < off_dst; i++) {
if (dest[i] != 0) {
- printf("rte_memcpy() failed for %u bytes (offsets=%u,%u): "
+ printf("rte_memcpy%s() failed for %u bytes (offsets=%u,%u nt=%s): "
"[modified before start of dst].\n",
- (unsigned)size, off_src, off_dst);
+ nt_mode ? "_ex" : "",
+ (unsigned int)size, off_src, off_dst, nt_mode_str[nt_mode]);
return -1;
}
}
@@ -77,9 +97,11 @@ test_single_memcpy(unsigned int off_src, unsigned int off_dst, size_t size)
/* Check everything was copied */
for (i = 0; i < size; i++) {
if (dest[i + off_dst] != src[i + off_src]) {
- printf("rte_memcpy() failed for %u bytes (offsets=%u,%u): "
- "[didn't copy byte %u].\n",
- (unsigned)size, off_src, off_dst, i);
+ printf("rte_memcpy%s() failed for %u bytes (offsets=%u,%u nt=%s): "
+ "[didn't copy byte %u: 0x%02x!=0x%02x].\n",
+ nt_mode ? "_ex" : "",
+ (unsigned int)size, off_src, off_dst, nt_mode_str[nt_mode], i,
+ dest[i + off_dst], src[i + off_src]);
return -1;
}
}
@@ -87,9 +109,10 @@ test_single_memcpy(unsigned int off_src, unsigned int off_dst, size_t size)
/* Check nothing after copy was affected */
for (i = size; i < SMALL_BUFFER_SIZE; i++) {
if (dest[i + off_dst] != 0) {
- printf("rte_memcpy() failed for %u bytes (offsets=%u,%u): "
+ printf("rte_memcpy%s() failed for %u bytes (offsets=%u,%u nt=%s): "
"[copied too many].\n",
- (unsigned)size, off_src, off_dst);
+ nt_mode ? "_ex" : "",
+ (unsigned int)size, off_src, off_dst, nt_mode_str[nt_mode]);
return -1;
}
}
@@ -102,16 +125,22 @@ test_single_memcpy(unsigned int off_src, unsigned int off_dst, size_t size)
static int
func_test(void)
{
- unsigned int off_src, off_dst, i;
+ unsigned int off_src, off_dst, i, nt_mode;
int ret;
- for (off_src = 0; off_src < ALIGNMENT_UNIT; off_src++) {
- for (off_dst = 0; off_dst < ALIGNMENT_UNIT; off_dst++) {
- for (i = 0; i < RTE_DIM(buf_sizes); i++) {
- ret = test_single_memcpy(off_src, off_dst,
- buf_sizes[i]);
- if (ret != 0)
- return -1;
+ for (nt_mode = 0; nt_mode < 4; nt_mode++) {
+ for (off_src = 0; off_src < ALIGNMENT_UNIT; off_src++) {
+ for (off_dst = 0; off_dst < ALIGNMENT_UNIT; off_dst++) {
+ for (i = 0; i < RTE_DIM(buf_sizes); i++) {
+ printf("TEST: rte_memcpy%s(offsets=%u,%u size=%zu nt=%s)\n",
+ nt_mode ? "_ex" : "",
+ off_src, off_dst, buf_sizes[i],
+ nt_mode_str[nt_mode]);
+ ret = test_single_memcpy(off_src, off_dst,
+ buf_sizes[i], nt_mode);
+ if (ret != 0)
+ return -1;
+ }
}
}
}
@@ -1,5 +1,6 @@
/* SPDX-License-Identifier: BSD-3-Clause
* Copyright(c) 2010-2014 Intel Corporation
+ * Copyright(c) 2022 SmartShare Systems
*/
#include <stdint.h>
@@ -15,6 +16,7 @@
#include <rte_malloc.h>
#include <rte_memcpy.h>
+#include <rte_atomic.h>
#include "test.h"
@@ -27,9 +29,9 @@
/* List of buffer sizes to test */
#if TEST_VALUE_RANGE == 0
static size_t buf_sizes[] = {
- 1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 15, 16, 17, 31, 32, 33, 63, 64, 65, 127, 128,
- 129, 191, 192, 193, 255, 256, 257, 319, 320, 321, 383, 384, 385, 447, 448,
- 449, 511, 512, 513, 767, 768, 769, 1023, 1024, 1025, 1518, 1522, 1536, 1600,
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 15, 16, 17, 31, 32, 33, 40, 48, 60, 63, 64, 65, 80, 92, 124,
+ 127, 128, 129, 140, 152, 191, 192, 193, 255, 256, 257, 319, 320, 321, 383, 384, 385, 447,
+ 448, 449, 511, 512, 513, 767, 768, 769, 1023, 1024, 1025, 1518, 1522, 1536, 1600,
2048, 2560, 3072, 3584, 4096, 4608, 5120, 5632, 6144, 6656, 7168, 7680, 8192
};
/* MUST be as large as largest packet size above */
@@ -72,7 +74,7 @@ static uint8_t *small_buf_read, *small_buf_write;
static int
init_buffers(void)
{
- unsigned i;
+ unsigned int i;
large_buf_read = rte_malloc("memcpy", LARGE_BUFFER_SIZE + ALIGNMENT_UNIT, ALIGNMENT_UNIT);
if (large_buf_read == NULL)
@@ -151,7 +153,7 @@ static void
do_uncached_write(uint8_t *dst, int is_dst_cached,
const uint8_t *src, int is_src_cached, size_t size)
{
- unsigned i, j;
+ unsigned int i, j;
size_t dst_addrs[TEST_BATCH_SIZE], src_addrs[TEST_BATCH_SIZE];
for (i = 0; i < (TEST_ITERATIONS / TEST_BATCH_SIZE); i++) {
@@ -167,66 +169,112 @@ do_uncached_write(uint8_t *dst, int is_dst_cached,
* Run a single memcpy performance test. This is a macro to ensure that if
* the "size" parameter is a constant it won't be converted to a variable.
*/
-#define SINGLE_PERF_TEST(dst, is_dst_cached, dst_uoffset, \
- src, is_src_cached, src_uoffset, size) \
-do { \
- unsigned int iter, t; \
- size_t dst_addrs[TEST_BATCH_SIZE], src_addrs[TEST_BATCH_SIZE]; \
- uint64_t start_time, total_time = 0; \
- uint64_t total_time2 = 0; \
- for (iter = 0; iter < (TEST_ITERATIONS / TEST_BATCH_SIZE); iter++) { \
- fill_addr_arrays(dst_addrs, is_dst_cached, dst_uoffset, \
- src_addrs, is_src_cached, src_uoffset); \
- start_time = rte_rdtsc(); \
- for (t = 0; t < TEST_BATCH_SIZE; t++) \
- rte_memcpy(dst+dst_addrs[t], src+src_addrs[t], size); \
- total_time += rte_rdtsc() - start_time; \
- } \
- for (iter = 0; iter < (TEST_ITERATIONS / TEST_BATCH_SIZE); iter++) { \
- fill_addr_arrays(dst_addrs, is_dst_cached, dst_uoffset, \
- src_addrs, is_src_cached, src_uoffset); \
- start_time = rte_rdtsc(); \
- for (t = 0; t < TEST_BATCH_SIZE; t++) \
- memcpy(dst+dst_addrs[t], src+src_addrs[t], size); \
- total_time2 += rte_rdtsc() - start_time; \
- } \
- printf("%3.0f -", (double)total_time / TEST_ITERATIONS); \
- printf("%3.0f", (double)total_time2 / TEST_ITERATIONS); \
- printf("(%6.2f%%) ", ((double)total_time - total_time2)*100/total_time2); \
+#define SINGLE_PERF_TEST(dst, is_dst_cached, dst_uoffset, \
+ src, is_src_cached, src_uoffset, size) \
+do { \
+ unsigned int iter, t; \
+ size_t dst_addrs[TEST_BATCH_SIZE], src_addrs[TEST_BATCH_SIZE]; \
+ uint64_t start_time; \
+ uint64_t total_time_rte = 0, total_time_std = 0; \
+ uint64_t total_time_ntd = 0, total_time_nts = 0, total_time_nt = 0; \
+ const uint64_t flags = ((dst_uoffset == 0) ? \
+ (ALIGNMENT_UNIT << RTE_MEMOPS_F_DSTA_SHIFT) : 0) | \
+ ((src_uoffset == 0) ? \
+ (ALIGNMENT_UNIT << RTE_MEMOPS_F_SRCA_SHIFT) : 0); \
+ for (iter = 0; iter < (TEST_ITERATIONS / TEST_BATCH_SIZE); iter++) { \
+ fill_addr_arrays(dst_addrs, is_dst_cached, dst_uoffset, \
+ src_addrs, is_src_cached, src_uoffset); \
+ start_time = rte_rdtsc(); \
+ for (t = 0; t < TEST_BATCH_SIZE; t++) \
+ rte_memcpy(dst + dst_addrs[t], src + src_addrs[t], size); \
+ total_time_rte += rte_rdtsc() - start_time; \
+ } \
+ for (iter = 0; iter < (TEST_ITERATIONS / TEST_BATCH_SIZE); iter++) { \
+ fill_addr_arrays(dst_addrs, is_dst_cached, dst_uoffset, \
+ src_addrs, is_src_cached, src_uoffset); \
+ start_time = rte_rdtsc(); \
+ for (t = 0; t < TEST_BATCH_SIZE; t++) \
+ memcpy(dst + dst_addrs[t], src + src_addrs[t], size); \
+ total_time_std += rte_rdtsc() - start_time; \
+ } \
+ if (!(is_dst_cached && is_src_cached)) { \
+ for (iter = 0; iter < (TEST_ITERATIONS / TEST_BATCH_SIZE); iter++) { \
+ fill_addr_arrays(dst_addrs, is_dst_cached, dst_uoffset, \
+ src_addrs, is_src_cached, src_uoffset); \
+ start_time = rte_rdtsc(); \
+ for (t = 0; t < TEST_BATCH_SIZE; t++) \
+ rte_memcpy_ex(dst + dst_addrs[t], src + src_addrs[t], size, \
+ flags | RTE_MEMOPS_F_DST_NT); \
+ total_time_ntd += rte_rdtsc() - start_time; \
+ } \
+ for (iter = 0; iter < (TEST_ITERATIONS / TEST_BATCH_SIZE); iter++) { \
+ fill_addr_arrays(dst_addrs, is_dst_cached, dst_uoffset, \
+ src_addrs, is_src_cached, src_uoffset); \
+ start_time = rte_rdtsc(); \
+ for (t = 0; t < TEST_BATCH_SIZE; t++) \
+ rte_memcpy_ex(dst + dst_addrs[t], src + src_addrs[t], size, \
+ flags | RTE_MEMOPS_F_SRC_NT); \
+ total_time_nts += rte_rdtsc() - start_time; \
+ } \
+ for (iter = 0; iter < (TEST_ITERATIONS / TEST_BATCH_SIZE); iter++) { \
+ fill_addr_arrays(dst_addrs, is_dst_cached, dst_uoffset, \
+ src_addrs, is_src_cached, src_uoffset); \
+ start_time = rte_rdtsc(); \
+ for (t = 0; t < TEST_BATCH_SIZE; t++) \
+ rte_memcpy_ex(dst + dst_addrs[t], src + src_addrs[t], size, \
+ flags | RTE_MEMOPS_F_DST_NT | RTE_MEMOPS_F_SRC_NT); \
+ total_time_nt += rte_rdtsc() - start_time; \
+ } \
+ } \
+ printf(" %4.0f-", (double)total_time_rte / TEST_ITERATIONS); \
+ printf("%4.0f", (double)total_time_std / TEST_ITERATIONS); \
+ printf("(%+4.0f%%)", ((double)total_time_rte - total_time_std) * 100 / total_time_std); \
+ if (!(is_dst_cached && is_src_cached)) { \
+ printf(" %4.0f", (double)total_time_ntd / TEST_ITERATIONS); \
+ printf(" %4.0f", (double)total_time_nts / TEST_ITERATIONS); \
+ printf(" %4.0f", (double)total_time_nt / TEST_ITERATIONS); \
+ if (total_time_nt / total_time_std > 9) \
+ printf("(*%4.1f)", (double)total_time_nt / total_time_std); \
+ else \
+ printf("(%+4.0f%%)", \
+ ((double)total_time_nt - total_time_std) * 100 / total_time_std); \
+ } \
} while (0)
/* Run aligned memcpy tests for each cached/uncached permutation */
-#define ALL_PERF_TESTS_FOR_SIZE(n) \
-do { \
- if (__builtin_constant_p(n)) \
- printf("\nC%6u", (unsigned)n); \
- else \
- printf("\n%7u", (unsigned)n); \
- SINGLE_PERF_TEST(small_buf_write, 1, 0, small_buf_read, 1, 0, n); \
- SINGLE_PERF_TEST(large_buf_write, 0, 0, small_buf_read, 1, 0, n); \
- SINGLE_PERF_TEST(small_buf_write, 1, 0, large_buf_read, 0, 0, n); \
- SINGLE_PERF_TEST(large_buf_write, 0, 0, large_buf_read, 0, 0, n); \
+#define ALL_PERF_TESTS_FOR_SIZE(n) \
+do { \
+ if (__builtin_constant_p(n)) \
+ printf("\nC%6u", (unsigned int)n); \
+ else \
+ printf("\n%7u", (unsigned int)n); \
+ SINGLE_PERF_TEST(small_buf_write, 1, 0, small_buf_read, 1, 0, n); \
+ SINGLE_PERF_TEST(large_buf_write, 0, 0, small_buf_read, 1, 0, n); \
+ SINGLE_PERF_TEST(small_buf_write, 1, 0, large_buf_read, 0, 0, n); \
+ SINGLE_PERF_TEST(large_buf_write, 0, 0, large_buf_read, 0, 0, n); \
} while (0)
/* Run unaligned memcpy tests for each cached/uncached permutation */
-#define ALL_PERF_TESTS_FOR_SIZE_UNALIGNED(n) \
-do { \
- if (__builtin_constant_p(n)) \
- printf("\nC%6u", (unsigned)n); \
- else \
- printf("\n%7u", (unsigned)n); \
- SINGLE_PERF_TEST(small_buf_write, 1, 1, small_buf_read, 1, 5, n); \
- SINGLE_PERF_TEST(large_buf_write, 0, 1, small_buf_read, 1, 5, n); \
- SINGLE_PERF_TEST(small_buf_write, 1, 1, large_buf_read, 0, 5, n); \
- SINGLE_PERF_TEST(large_buf_write, 0, 1, large_buf_read, 0, 5, n); \
+#define ALL_PERF_TESTS_FOR_SIZE_UNALIGNED(n) \
+do { \
+ if (__builtin_constant_p(n)) \
+ printf("\nC%6u", (unsigned int)n); \
+ else \
+ printf("\n%7u", (unsigned int)n); \
+ SINGLE_PERF_TEST(small_buf_write, 1, 1, small_buf_read, 1, 5, n); \
+ SINGLE_PERF_TEST(large_buf_write, 0, 1, small_buf_read, 1, 5, n); \
+ SINGLE_PERF_TEST(small_buf_write, 1, 1, large_buf_read, 0, 5, n); \
+ SINGLE_PERF_TEST(large_buf_write, 0, 1, large_buf_read, 0, 5, n); \
} while (0)
/* Run memcpy tests for constant length */
-#define ALL_PERF_TEST_FOR_CONSTANT \
-do { \
- TEST_CONSTANT(6U); TEST_CONSTANT(64U); TEST_CONSTANT(128U); \
- TEST_CONSTANT(192U); TEST_CONSTANT(256U); TEST_CONSTANT(512U); \
- TEST_CONSTANT(768U); TEST_CONSTANT(1024U); TEST_CONSTANT(1536U); \
+#define ALL_PERF_TEST_FOR_CONSTANT \
+do { \
+ TEST_CONSTANT(4U); TEST_CONSTANT(6U); TEST_CONSTANT(8U); \
+ TEST_CONSTANT(16U); TEST_CONSTANT(64U); TEST_CONSTANT(128U); \
+ TEST_CONSTANT(192U); TEST_CONSTANT(256U); TEST_CONSTANT(512U); \
+ TEST_CONSTANT(768U); TEST_CONSTANT(1024U); TEST_CONSTANT(1536U); \
+ TEST_CONSTANT(2048U); \
} while (0)
/* Run all memcpy tests for aligned constant cases */
@@ -251,7 +299,7 @@ perf_test_constant_unaligned(void)
static inline void
perf_test_variable_aligned(void)
{
- unsigned i;
+ unsigned int i;
for (i = 0; i < RTE_DIM(buf_sizes); i++) {
ALL_PERF_TESTS_FOR_SIZE((size_t)buf_sizes[i]);
}
@@ -261,7 +309,7 @@ perf_test_variable_aligned(void)
static inline void
perf_test_variable_unaligned(void)
{
- unsigned i;
+ unsigned int i;
for (i = 0; i < RTE_DIM(buf_sizes); i++) {
ALL_PERF_TESTS_FOR_SIZE_UNALIGNED((size_t)buf_sizes[i]);
}
@@ -282,7 +330,7 @@ perf_test(void)
#if TEST_VALUE_RANGE != 0
/* Set up buf_sizes array, if required */
- unsigned i;
+ unsigned int i;
for (i = 0; i < TEST_VALUE_RANGE; i++)
buf_sizes[i] = i;
#endif
@@ -290,13 +338,14 @@ perf_test(void)
/* See function comment */
do_uncached_write(large_buf_write, 0, small_buf_read, 1, SMALL_BUFFER_SIZE);
- printf("\n** rte_memcpy() - memcpy perf. tests (C = compile-time constant) **\n"
- "======= ================= ================= ================= =================\n"
- " Size Cache to cache Cache to mem Mem to cache Mem to mem\n"
- "(bytes) (ticks) (ticks) (ticks) (ticks)\n"
- "------- ----------------- ----------------- ----------------- -----------------");
+ printf("\n** rte_memcpy(RTE)/memcpy(STD)/rte_memcpy_ex(NTD/NTS/NT) - memcpy perf. tests (C = compile-time constant) **\n"
+ "======= ================ ====================================== ====================================== ======================================\n"
+ " Size Cache to cache Cache to mem Mem to cache Mem to mem\n"
+ "(bytes) (ticks) (ticks) (ticks) (ticks)\n"
+ " RTE- STD(diff%%) RTE- STD(diff%%) NTD NTS NT(diff%%) RTE- STD(diff%%) NTD NTS NT(diff%%) RTE- STD(diff%%) NTD NTS NT(diff%%)\n"
+ "------- ---------------- -------------------------------------- -------------------------------------- --------------------------------------");
- printf("\n================================= %2dB aligned =================================",
+ printf("\n================================================================ %2dB aligned ===============================================================",
ALIGNMENT_UNIT);
/* Do aligned tests where size is a variable */
timespec_get(&tv_begin, TIME_UTC);
@@ -304,28 +353,28 @@ perf_test(void)
timespec_get(&tv_end, TIME_UTC);
time_aligned = (double)(tv_end.tv_sec - tv_begin.tv_sec)
+ ((double)tv_end.tv_nsec - tv_begin.tv_nsec) / NS_PER_S;
- printf("\n------- ----------------- ----------------- ----------------- -----------------");
+ printf("\n------- ---------------- -------------------------------------- -------------------------------------- --------------------------------------");
/* Do aligned tests where size is a compile-time constant */
timespec_get(&tv_begin, TIME_UTC);
perf_test_constant_aligned();
timespec_get(&tv_end, TIME_UTC);
time_aligned_const = (double)(tv_end.tv_sec - tv_begin.tv_sec)
+ ((double)tv_end.tv_nsec - tv_begin.tv_nsec) / NS_PER_S;
- printf("\n================================== Unaligned ==================================");
+ printf("\n================================================================= Unaligned =================================================================");
/* Do unaligned tests where size is a variable */
timespec_get(&tv_begin, TIME_UTC);
perf_test_variable_unaligned();
timespec_get(&tv_end, TIME_UTC);
time_unaligned = (double)(tv_end.tv_sec - tv_begin.tv_sec)
+ ((double)tv_end.tv_nsec - tv_begin.tv_nsec) / NS_PER_S;
- printf("\n------- ----------------- ----------------- ----------------- -----------------");
+ printf("\n------- ---------------- -------------------------------------- -------------------------------------- --------------------------------------");
/* Do unaligned tests where size is a compile-time constant */
timespec_get(&tv_begin, TIME_UTC);
perf_test_constant_unaligned();
timespec_get(&tv_end, TIME_UTC);
time_unaligned_const = (double)(tv_end.tv_sec - tv_begin.tv_sec)
+ ((double)tv_end.tv_nsec - tv_begin.tv_nsec) / NS_PER_S;
- printf("\n======= ================= ================= ================= =================\n\n");
+ printf("\n======= ================ ====================================== ====================================== ======================================\n\n");
printf("Test Execution Time (seconds):\n");
printf("Aligned variable copy size = %8.3f\n", time_aligned);
@@ -1,5 +1,6 @@
/* SPDX-License-Identifier: BSD-3-Clause
* Copyright(c) 2010-2014 Intel Corporation
+ * Copyright(c) 2022 SmartShare Systems
*/
#ifndef _RTE_MEMCPY_H_
@@ -11,6 +12,9 @@
* Functions for vectorised implementation of memcpy().
*/
+#include <rte_common.h>
+#include <rte_compat.h>
+
/**
* Copy 16 bytes from one location to another using optimised
* instructions. The locations should not overlap.
@@ -113,4 +117,119 @@ rte_memcpy(void *dst, const void *src, size_t n);
#endif /* __DOXYGEN__ */
+/*
+ * Advanced/Non-Temporal Memory Operations Flags.
+ */
+
+/** Length alignment hint mask. */
+#define RTE_MEMOPS_F_LENA_MASK (UINT64_C(0xFE) << 0)
+/** Length alignment hint shift. */
+#define RTE_MEMOPS_F_LENA_SHIFT 0
+/** Hint: Length is 2 byte aligned. */
+#define RTE_MEMOPS_F_LEN2A (UINT64_C(2) << 0)
+/** Hint: Length is 4 byte aligned. */
+#define RTE_MEMOPS_F_LEN4A (UINT64_C(4) << 0)
+/** Hint: Length is 8 byte aligned. */
+#define RTE_MEMOPS_F_LEN8A (UINT64_C(8) << 0)
+/** Hint: Length is 16 byte aligned. */
+#define RTE_MEMOPS_F_LEN16A (UINT64_C(16) << 0)
+/** Hint: Length is 32 byte aligned. */
+#define RTE_MEMOPS_F_LEN32A (UINT64_C(32) << 0)
+/** Hint: Length is 64 byte aligned. */
+#define RTE_MEMOPS_F_LEN64A (UINT64_C(64) << 0)
+/** Hint: Length is 128 byte aligned. */
+#define RTE_MEMOPS_F_LEN128A (UINT64_C(128) << 0)
+
+/** Prefer non-temporal access to source memory area.
+ */
+#define RTE_MEMOPS_F_SRC_NT (UINT64_C(1) << 8)
+/** Source address alignment hint mask. */
+#define RTE_MEMOPS_F_SRCA_MASK (UINT64_C(0xFE) << 8)
+/** Source address alignment hint shift. */
+#define RTE_MEMOPS_F_SRCA_SHIFT 8
+/** Hint: Source address is 2 byte aligned. */
+#define RTE_MEMOPS_F_SRC2A (UINT64_C(2) << 8)
+/** Hint: Source address is 4 byte aligned. */
+#define RTE_MEMOPS_F_SRC4A (UINT64_C(4) << 8)
+/** Hint: Source address is 8 byte aligned. */
+#define RTE_MEMOPS_F_SRC8A (UINT64_C(8) << 8)
+/** Hint: Source address is 16 byte aligned. */
+#define RTE_MEMOPS_F_SRC16A (UINT64_C(16) << 8)
+/** Hint: Source address is 32 byte aligned. */
+#define RTE_MEMOPS_F_SRC32A (UINT64_C(32) << 8)
+/** Hint: Source address is 64 byte aligned. */
+#define RTE_MEMOPS_F_SRC64A (UINT64_C(64) << 8)
+/** Hint: Source address is 128 byte aligned. */
+#define RTE_MEMOPS_F_SRC128A (UINT64_C(128) << 8)
+
+/** Prefer non-temporal access to destination memory area.
+ *
+ * On x86 architecture:
+ * Remember to call rte_wmb() after a sequence of copy operations.
+ */
+#define RTE_MEMOPS_F_DST_NT (UINT64_C(1) << 16)
+/** Destination address alignment hint mask. */
+#define RTE_MEMOPS_F_DSTA_MASK (UINT64_C(0xFE) << 16)
+/** Destination address alignment hint shift. */
+#define RTE_MEMOPS_F_DSTA_SHIFT 16
+/** Hint: Destination address is 2 byte aligned. */
+#define RTE_MEMOPS_F_DST2A (UINT64_C(2) << 16)
+/** Hint: Destination address is 4 byte aligned. */
+#define RTE_MEMOPS_F_DST4A (UINT64_C(4) << 16)
+/** Hint: Destination address is 8 byte aligned. */
+#define RTE_MEMOPS_F_DST8A (UINT64_C(8) << 16)
+/** Hint: Destination address is 16 byte aligned. */
+#define RTE_MEMOPS_F_DST16A (UINT64_C(16) << 16)
+/** Hint: Destination address is 32 byte aligned. */
+#define RTE_MEMOPS_F_DST32A (UINT64_C(32) << 16)
+/** Hint: Destination address is 64 byte aligned. */
+#define RTE_MEMOPS_F_DST64A (UINT64_C(64) << 16)
+/** Hint: Destination address is 128 byte aligned. */
+#define RTE_MEMOPS_F_DST128A (UINT64_C(128) << 16)
+
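+/* Note (illustrative): each named alignment flag is simply the alignment value
+ * shifted into its field, e.g. (UINT64_C(32) << RTE_MEMOPS_F_DSTA_SHIFT) is
+ * the same as RTE_MEMOPS_F_DST32A, so a hint can also be built from a
+ * power-of-2 constant via the RTE_MEMOPS_F_(LEN|SRC|DST)A_SHIFT macros.
+ */
+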
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Advanced/non-temporal memory copy.
+ * The memory areas must not overlap.
+ *
+ * @param dst
+ * Pointer to the destination memory area.
+ * @param src
+ * Pointer to the source memory area.
+ * @param len
+ * Number of bytes to copy.
+ * @param flags
+ * Hints for memory access.
+ * Any of the RTE_MEMOPS_F_(SRC|DST)_NT, RTE_MEMOPS_F_(LEN|SRC|DST)<n>A flags.
+ * Must be constant at build time.
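+ *
+ * Usage sketch (illustrative; assumes both pointers and the length are
+ * known to be 64 byte aligned):
+ * @code{.c}
+ * rte_memcpy_ex(dst, src, len,
+ *         RTE_MEMOPS_F_SRC_NT | RTE_MEMOPS_F_DST_NT |
+ *         RTE_MEMOPS_F_SRC64A | RTE_MEMOPS_F_DST64A | RTE_MEMOPS_F_LEN64A);
+ * rte_wmb();
+ * @endcode
+ * On x86, rte_wmb() should follow a sequence of copies using
+ * RTE_MEMOPS_F_DST_NT, as noted for that flag.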
+ */
+__rte_experimental
+static __rte_always_inline
+__attribute__((__nonnull__(1, 2)))
+#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 100000)
+__attribute__((__access__(write_only, 1, 3), __access__(read_only, 2, 3)))
+#endif
+void rte_memcpy_ex(void *__rte_restrict dst, const void *__rte_restrict src, size_t len,
+ const uint64_t flags);
+
+#ifndef RTE_MEMCPY_EX_ARCH_DEFINED
+
+/* Fallback implementation, if no arch-specific implementation is provided. */
+__rte_experimental
+static __rte_always_inline
+__attribute__((__nonnull__(1, 2)))
+#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 100000)
+__attribute__((__access__(write_only, 1, 3), __access__(read_only, 2, 3)))
+#endif
+void rte_memcpy_ex(void *__rte_restrict dst, const void *__rte_restrict src, size_t len,
+ const uint64_t flags)
+{
+ RTE_SET_USED(flags);
+ memcpy(dst, src, len);
+}
+
+#endif /* RTE_MEMCPY_EX_ARCH_DEFINED */
+
#endif /* _RTE_MEMCPY_H_ */
@@ -1,5 +1,6 @@
/* SPDX-License-Identifier: BSD-3-Clause
* Copyright(c) 2010-2014 Intel Corporation
+ * Copyright(c) 2022 SmartShare Systems
*/
#ifndef _RTE_MEMCPY_X86_64_H_
@@ -17,6 +18,10 @@
#include <rte_vect.h>
#include <rte_common.h>
#include <rte_config.h>
+#include <rte_debug.h>
+
+#define RTE_MEMCPY_EX_ARCH_DEFINED
+#include "generic/rte_memcpy.h"
#ifdef __cplusplus
extern "C" {
@@ -868,6 +873,1204 @@ rte_memcpy(void *dst, const void *src, size_t n)
return rte_memcpy_generic(dst, src, n);
}
+/*
+ * Advanced/Non-Temporal Memory Operations.
+ */
+
+/**
+ * @internal
+ * Workaround for _mm_stream_load_si128() missing const in the parameter.
+ */
+__rte_internal
+static __rte_always_inline
+__m128i _mm_stream_load_si128_const(const __m128i * const mem_addr)
+{
+#if defined(RTE_TOOLCHAIN_GCC)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wdiscarded-qualifiers"
+#endif
+ return _mm_stream_load_si128(mem_addr);
+#if defined(RTE_TOOLCHAIN_GCC)
+#pragma GCC diagnostic pop
+#endif
+}
+
+/**
+ * @internal
+ * Memory copy from non-temporal source area.
+ *
+ * @note
+ * Performance is optimal when source pointer is 16 byte aligned.
+ *
+ * @param dst
+ * Pointer to the destination memory area.
+ * @param src
+ * Pointer to the non-temporal source memory area.
+ * @param len
+ * Number of bytes to copy.
+ * @param flags
+ * Hints for memory access.
+ * Any of the RTE_MEMOPS_F_(LEN|SRC)<n>A flags.
+ * The RTE_MEMOPS_F_SRC_NT flag must be set.
+ * The RTE_MEMOPS_F_DST_NT flag must be clear.
+ * The RTE_MEMOPS_F_DST<n>A flags are ignored.
+ * Must be constant at build time.
+ */
+__rte_internal
+static __rte_always_inline
+__attribute__((__nonnull__(1, 2)))
+#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 100000)
+__attribute__((__access__(write_only, 1, 3), __access__(read_only, 2, 3)))
+#endif
+void rte_memcpy_nts(void *__rte_restrict dst, const void *__rte_restrict src, size_t len,
+ const uint64_t flags)
+{
+ register __m128i xmm0, xmm1, xmm2, xmm3;
+
+#ifndef RTE_TOOLCHAIN_CLANG /* Clang doesn't support using __builtin_constant_p() like this. */
+ RTE_BUILD_BUG_ON(!__builtin_constant_p(flags));
+#endif /* !RTE_TOOLCHAIN_CLANG */
+ RTE_ASSERT(!(flags & RTE_MEMOPS_F_SRCA_MASK) || rte_is_aligned(src,
+ (flags & RTE_MEMOPS_F_SRCA_MASK) >> RTE_MEMOPS_F_SRCA_SHIFT));
+ RTE_ASSERT(!(flags & RTE_MEMOPS_F_LENA_MASK) || (len &
+ ((flags & RTE_MEMOPS_F_LENA_MASK) >> RTE_MEMOPS_F_LENA_SHIFT) - 1) == 0);
+
+ RTE_ASSERT((flags & (RTE_MEMOPS_F_SRC_NT | RTE_MEMOPS_F_DST_NT)) == RTE_MEMOPS_F_SRC_NT);
+
+ if (unlikely(len == 0))
+ return;
+
+ /* If source is not 16 byte aligned, then copy first part of data via bounce buffer,
+ * to achieve 16 byte alignment of source pointer.
+ * This invalidates the source, destination and length alignment flags, and
+ * potentially makes the destination pointer unaligned.
+ *
+ * Omitted if source is known to be 16 byte aligned.
+ */
+ if (!((flags & RTE_MEMOPS_F_SRCA_MASK) >= RTE_MEMOPS_F_SRC16A)) {
+ /* Source is not known to be 16 byte aligned, but might be. */
+ /** How many bytes is source offset from 16 byte alignment (floor rounding). */
+ const size_t offset = (uintptr_t)src & 15;
+
+ if (offset) {
+ /* Source is not 16 byte aligned. */
+ char buffer[16] __rte_aligned(16);
+ /** How many bytes is source away from 16 byte alignment
+ * (ceiling rounding).
+ */
+ const size_t first = 16 - offset;
+
+ xmm0 = _mm_stream_load_si128_const(RTE_PTR_SUB(src, offset));
+ _mm_store_si128((void *)buffer, xmm0);
+
+ /* Test for short length.
+ *
+ * Omitted if length is known to be >= 16.
+ */
+ if (!(__builtin_constant_p(len) && len >= 16) &&
+ unlikely(len <= first)) {
+ /* Short length. */
+ rte_mov15_or_less(dst, RTE_PTR_ADD(buffer, offset), len);
+ return;
+ }
+
+ /* Copy until source pointer is 16 byte aligned. */
+ rte_mov15_or_less(dst, RTE_PTR_ADD(buffer, offset), first);
+ src = RTE_PTR_ADD(src, first);
+ dst = RTE_PTR_ADD(dst, first);
+ len -= first;
+ }
+ }
+
+ /* Source pointer is now 16 byte aligned. */
+ RTE_ASSERT(rte_is_aligned(src, 16));
+
+ /* Copy large portion of data in chunks of 64 byte. */
+ while (len >= 64) {
+ xmm0 = _mm_stream_load_si128_const(RTE_PTR_ADD(src, 0 * 16));
+ xmm1 = _mm_stream_load_si128_const(RTE_PTR_ADD(src, 1 * 16));
+ xmm2 = _mm_stream_load_si128_const(RTE_PTR_ADD(src, 2 * 16));
+ xmm3 = _mm_stream_load_si128_const(RTE_PTR_ADD(src, 3 * 16));
+ _mm_storeu_si128(RTE_PTR_ADD(dst, 0 * 16), xmm0);
+ _mm_storeu_si128(RTE_PTR_ADD(dst, 1 * 16), xmm1);
+ _mm_storeu_si128(RTE_PTR_ADD(dst, 2 * 16), xmm2);
+ _mm_storeu_si128(RTE_PTR_ADD(dst, 3 * 16), xmm3);
+ src = RTE_PTR_ADD(src, 64);
+ dst = RTE_PTR_ADD(dst, 64);
+ len -= 64;
+ }
+
+ /* Copy following 32 and 16 byte portions of data.
+ *
+ * Omitted if source is known to be 16 byte aligned (so the alignment
+ * flags are still valid)
+ * and length is known to be respectively 64 or 32 byte aligned.
+ */
+ if (!(((flags & RTE_MEMOPS_F_SRCA_MASK) >= RTE_MEMOPS_F_SRC16A) &&
+ ((flags & RTE_MEMOPS_F_LENA_MASK) >= RTE_MEMOPS_F_LEN64A)) &&
+ (len & 32)) {
+ xmm0 = _mm_stream_load_si128_const(RTE_PTR_ADD(src, 0 * 16));
+ xmm1 = _mm_stream_load_si128_const(RTE_PTR_ADD(src, 1 * 16));
+ _mm_storeu_si128(RTE_PTR_ADD(dst, 0 * 16), xmm0);
+ _mm_storeu_si128(RTE_PTR_ADD(dst, 1 * 16), xmm1);
+ src = RTE_PTR_ADD(src, 32);
+ dst = RTE_PTR_ADD(dst, 32);
+ }
+ if (!(((flags & RTE_MEMOPS_F_SRCA_MASK) >= RTE_MEMOPS_F_SRC16A) &&
+ ((flags & RTE_MEMOPS_F_LENA_MASK) >= RTE_MEMOPS_F_LEN32A)) &&
+ (len & 16)) {
+ xmm2 = _mm_stream_load_si128_const(src);
+ _mm_storeu_si128(dst, xmm2);
+ src = RTE_PTR_ADD(src, 16);
+ dst = RTE_PTR_ADD(dst, 16);
+ }
+
+ /* Copy remaining data, 15 byte or less, if any, via bounce buffer.
+ *
+ * Omitted if source is known to be 16 byte aligned (so the alignment
+ * flags are still valid) and length is known to be 16 byte aligned.
+ */
+ if (!(((flags & RTE_MEMOPS_F_SRCA_MASK) >= RTE_MEMOPS_F_SRC16A) &&
+ ((flags & RTE_MEMOPS_F_LENA_MASK) >= RTE_MEMOPS_F_LEN16A)) &&
+ (len & 15)) {
+ char buffer[16] __rte_aligned(16);
+
+ xmm3 = _mm_stream_load_si128_const(src);
+ _mm_store_si128((void *)buffer, xmm3);
+ rte_mov15_or_less(dst, buffer, len & 15);
+ }
+}
+
+/**
+ * @internal
+ * Memory copy to non-temporal destination area.
+ *
+ * @note
+ * If the destination and/or length is unaligned, the first and/or last copied
+ * bytes will be stored in the destination memory area using temporal access.
+ * @note
+ * Performance is optimal when destination pointer is 16 byte aligned.
+ *
+ * @param dst
+ * Pointer to the non-temporal destination memory area.
+ * @param src
+ * Pointer to the source memory area.
+ * @param len
+ * Number of bytes to copy.
+ * @param flags
+ * Hints for memory access.
+ * Any of the RTE_MEMOPS_F_(LEN|DST)<n>A flags.
+ * The RTE_MEMOPS_F_SRC_NT flag must be clear.
+ * The RTE_MEMOPS_F_DST_NT flag must be set.
+ * The RTE_MEMOPS_F_SRC<n>A flags are ignored.
+ * Must be constant at build time.
+ */
+__rte_internal
+static __rte_always_inline
+__attribute__((__nonnull__(1, 2)))
+#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 100000)
+__attribute__((__access__(write_only, 1, 3), __access__(read_only, 2, 3)))
+#endif
+void rte_memcpy_ntd(void *__rte_restrict dst, const void *__rte_restrict src, size_t len,
+ const uint64_t flags)
+{
+#ifndef RTE_TOOLCHAIN_CLANG /* Clang doesn't support using __builtin_constant_p() like this. */
+ RTE_BUILD_BUG_ON(!__builtin_constant_p(flags));
+#endif /* !RTE_TOOLCHAIN_CLANG */
+ RTE_ASSERT(!(flags & RTE_MEMOPS_F_DSTA_MASK) || rte_is_aligned(dst,
+ (flags & RTE_MEMOPS_F_DSTA_MASK) >> RTE_MEMOPS_F_DSTA_SHIFT));
+ RTE_ASSERT(!(flags & RTE_MEMOPS_F_LENA_MASK) || (len &
+ ((flags & RTE_MEMOPS_F_LENA_MASK) >> RTE_MEMOPS_F_LENA_SHIFT) - 1) == 0);
+
+ RTE_ASSERT((flags & (RTE_MEMOPS_F_SRC_NT | RTE_MEMOPS_F_DST_NT)) == RTE_MEMOPS_F_DST_NT);
+
+ if (unlikely(len == 0))
+ return;
+
+ if (((flags & RTE_MEMOPS_F_DSTA_MASK) >= RTE_MEMOPS_F_DST16A) ||
+ len >= 16) {
+ /* Length >= 16 and/or destination is known to be 16 byte aligned. */
+ register __m128i xmm0, xmm1, xmm2, xmm3;
+
+ /* If destination is not 16 byte aligned, then copy first part of data,
+ * to achieve 16 byte alignment of destination pointer.
+ * This invalidates the source, destination and length alignment flags, and
+ * potentially makes the source pointer unaligned.
+ *
+ * Omitted if destination is known to be 16 byte aligned.
+ */
+ if (!((flags & RTE_MEMOPS_F_DSTA_MASK) >= RTE_MEMOPS_F_DST16A)) {
+ /* Destination is not known to be 16 byte aligned, but might be. */
+ /** How many bytes is destination offset from 16 byte alignment
+ * (floor rounding).
+ */
+ const size_t offset = (uintptr_t)dst & 15;
+
+ if (offset) {
+ /* Destination is not 16 byte aligned. */
+ /** How many bytes is destination away from 16 byte alignment
+ * (ceiling rounding).
+ */
+ const size_t first = 16 - offset;
+
+ if (((flags & RTE_MEMOPS_F_DSTA_MASK) >= RTE_MEMOPS_F_DST4A) ||
+ (offset & 3) == 0) {
+ /* Destination is (known to be) 4 byte aligned. */
+ int32_t r0, r1, r2;
+
+ /* Copy until destination pointer is 16 byte aligned. */
+ if (first & 8) {
+ memcpy(&r0, RTE_PTR_ADD(src, 0 * 4), 4);
+ memcpy(&r1, RTE_PTR_ADD(src, 1 * 4), 4);
+ _mm_stream_si32(RTE_PTR_ADD(dst, 0 * 4), r0);
+ _mm_stream_si32(RTE_PTR_ADD(dst, 1 * 4), r1);
+ src = RTE_PTR_ADD(src, 8);
+ dst = RTE_PTR_ADD(dst, 8);
+ len -= 8;
+ }
+ if (first & 4) {
+ memcpy(&r2, src, 4);
+ _mm_stream_si32(dst, r2);
+ src = RTE_PTR_ADD(src, 4);
+ dst = RTE_PTR_ADD(dst, 4);
+ len -= 4;
+ }
+ } else {
+ /* Destination is not 4 byte aligned. */
+ /* Copy until destination pointer is 16 byte aligned. */
+ rte_mov15_or_less(dst, src, first);
+ src = RTE_PTR_ADD(src, first);
+ dst = RTE_PTR_ADD(dst, first);
+ len -= first;
+ }
+ }
+ }
+
+ /* Destination pointer is now 16 byte aligned. */
+ RTE_ASSERT(rte_is_aligned(dst, 16));
+
+ /* Copy large portion of data in chunks of 64 byte. */
+ while (len >= 64) {
+ xmm0 = _mm_loadu_si128(RTE_PTR_ADD(src, 0 * 16));
+ xmm1 = _mm_loadu_si128(RTE_PTR_ADD(src, 1 * 16));
+ xmm2 = _mm_loadu_si128(RTE_PTR_ADD(src, 2 * 16));
+ xmm3 = _mm_loadu_si128(RTE_PTR_ADD(src, 3 * 16));
+ _mm_stream_si128(RTE_PTR_ADD(dst, 0 * 16), xmm0);
+ _mm_stream_si128(RTE_PTR_ADD(dst, 1 * 16), xmm1);
+ _mm_stream_si128(RTE_PTR_ADD(dst, 2 * 16), xmm2);
+ _mm_stream_si128(RTE_PTR_ADD(dst, 3 * 16), xmm3);
+ src = RTE_PTR_ADD(src, 64);
+ dst = RTE_PTR_ADD(dst, 64);
+ len -= 64;
+ }
+
+ /* Copy following 32 and 16 byte portions of data.
+ *
+ * Omitted if destination is known to be 16 byte aligned (so the alignment
+ * flags are still valid)
+ * and length is known to be respectively 64 or 32 byte aligned.
+ */
+ if (!(((flags & RTE_MEMOPS_F_DSTA_MASK) >= RTE_MEMOPS_F_DST16A) &&
+ ((flags & RTE_MEMOPS_F_LENA_MASK) >= RTE_MEMOPS_F_LEN64A)) &&
+ (len & 32)) {
+ xmm0 = _mm_loadu_si128(RTE_PTR_ADD(src, 0 * 16));
+ xmm1 = _mm_loadu_si128(RTE_PTR_ADD(src, 1 * 16));
+ _mm_stream_si128(RTE_PTR_ADD(dst, 0 * 16), xmm0);
+ _mm_stream_si128(RTE_PTR_ADD(dst, 1 * 16), xmm1);
+ src = RTE_PTR_ADD(src, 32);
+ dst = RTE_PTR_ADD(dst, 32);
+ }
+ if (!(((flags & RTE_MEMOPS_F_DSTA_MASK) >= RTE_MEMOPS_F_DST16A) &&
+ ((flags & RTE_MEMOPS_F_LENA_MASK) >= RTE_MEMOPS_F_LEN32A)) &&
+ (len & 16)) {
+ xmm2 = _mm_loadu_si128(src);
+ _mm_stream_si128(dst, xmm2);
+ src = RTE_PTR_ADD(src, 16);
+ dst = RTE_PTR_ADD(dst, 16);
+ }
+ } else {
+ /* Length <= 15, and
+ * destination is not known to be 16 byte aligned (but might be).
+ */
+ /* If destination is not 4 byte aligned, then
+ * use normal copy and return.
+ *
+ * Omitted if destination is known to be 4 byte aligned.
+ */
+ if (!((flags & RTE_MEMOPS_F_DSTA_MASK) >= RTE_MEMOPS_F_DST4A) &&
+ !rte_is_aligned(dst, 4)) {
+ /* Destination is not 4 byte aligned. Non-temporal store is unavailable. */
+ rte_mov15_or_less(dst, src, len);
+ return;
+ }
+ /* Destination is (known to be) 4 byte aligned. Proceed. */
+ }
+
+ /* Destination pointer is now 4 byte (or 16 byte) aligned. */
+ RTE_ASSERT(rte_is_aligned(dst, 4));
+
+ /* Copy following 8 and 4 byte portions of data.
+ *
+ * Omitted if destination is known to be 16 byte aligned (so the alignment
+ * flags are still valid)
+ * and length is known to be respectively 16 or 8 byte aligned.
+ */
+ if (!(((flags & RTE_MEMOPS_F_DSTA_MASK) >= RTE_MEMOPS_F_DST16A) &&
+ ((flags & RTE_MEMOPS_F_LENA_MASK) >= RTE_MEMOPS_F_LEN16A)) &&
+ (len & 8)) {
+ int32_t r0, r1;
+
+ memcpy(&r0, RTE_PTR_ADD(src, 0 * 4), 4);
+ memcpy(&r1, RTE_PTR_ADD(src, 1 * 4), 4);
+ _mm_stream_si32(RTE_PTR_ADD(dst, 0 * 4), r0);
+ _mm_stream_si32(RTE_PTR_ADD(dst, 1 * 4), r1);
+ src = RTE_PTR_ADD(src, 8);
+ dst = RTE_PTR_ADD(dst, 8);
+ }
+ if (!(((flags & RTE_MEMOPS_F_DSTA_MASK) >= RTE_MEMOPS_F_DST16A) &&
+ ((flags & RTE_MEMOPS_F_LENA_MASK) >= RTE_MEMOPS_F_LEN8A)) &&
+ (len & 4)) {
+ int32_t r2;
+
+ memcpy(&r2, src, 4);
+ _mm_stream_si32(dst, r2);
+ src = RTE_PTR_ADD(src, 4);
+ dst = RTE_PTR_ADD(dst, 4);
+ }
+
+ /* Copy remaining 2 and 1 byte portions of data.
+ *
+ * Omitted if destination is known to be 16 byte aligned (so the alignment
+ * flags are still valid)
+ * and length is known to be respectively 4 and 2 byte aligned.
+ */
+ if (!(((flags & RTE_MEMOPS_F_DSTA_MASK) >= RTE_MEMOPS_F_DST16A) &&
+ ((flags & RTE_MEMOPS_F_LENA_MASK) >= RTE_MEMOPS_F_LEN4A)) &&
+ (len & 2)) {
+ int16_t r3;
+
+ memcpy(&r3, src, 2);
+ *(int16_t *)dst = r3;
+ src = RTE_PTR_ADD(src, 2);
+ dst = RTE_PTR_ADD(dst, 2);
+ }
+ if (!(((flags & RTE_MEMOPS_F_DSTA_MASK) >= RTE_MEMOPS_F_DST16A) &&
+ ((flags & RTE_MEMOPS_F_LENA_MASK) >= RTE_MEMOPS_F_LEN2A)) &&
+ (len & 1))
+ *(char *)dst = *(const char *)src;
+}
+
+/**
+ * @internal
+ * Non-temporal memory copy of 15 byte or less
+ * from 16 byte aligned source via bounce buffer.
+ * The memory areas must not overlap.
+ *
+ * @param dst
+ * Pointer to the non-temporal destination memory area.
+ * @param src
+ * Pointer to the non-temporal source memory area.
+ * Must be 16 byte aligned.
+ * @param len
+ * Only the 4 least significant bits of this parameter are used;
+ * they hold the number of remaining bytes to copy.
+ * @param flags
+ * Hints for memory access.
+ */
+__rte_internal
+static __rte_always_inline
+__attribute__((__nonnull__(1, 2)))
+#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 100000)
+__attribute__((__access__(write_only, 1, 3), __access__(read_only, 2, 3)))
+#endif
+void rte_memcpy_nt_15_or_less_s16a(void *__rte_restrict dst,
+ const void *__rte_restrict src, size_t len, const uint64_t flags)
+{
+ int32_t buffer[4] __rte_aligned(16);
+ register __m128i xmm0;
+
+#ifndef RTE_TOOLCHAIN_CLANG /* Clang doesn't support using __builtin_constant_p() like this. */
+ RTE_BUILD_BUG_ON(!__builtin_constant_p(flags));
+#endif /* !RTE_TOOLCHAIN_CLANG */
+ RTE_ASSERT(!(flags & RTE_MEMOPS_F_DSTA_MASK) || rte_is_aligned(dst,
+ (flags & RTE_MEMOPS_F_DSTA_MASK) >> RTE_MEMOPS_F_DSTA_SHIFT));
+ RTE_ASSERT(!(flags & RTE_MEMOPS_F_SRCA_MASK) || rte_is_aligned(src,
+ (flags & RTE_MEMOPS_F_SRCA_MASK) >> RTE_MEMOPS_F_SRCA_SHIFT));
+ RTE_ASSERT(!(flags & RTE_MEMOPS_F_LENA_MASK) || (len &
+ ((flags & RTE_MEMOPS_F_LENA_MASK) >> RTE_MEMOPS_F_LENA_SHIFT) - 1) == 0);
+
+ RTE_ASSERT((flags & (RTE_MEMOPS_F_SRC_NT | RTE_MEMOPS_F_DST_NT)) ==
+ (RTE_MEMOPS_F_SRC_NT | RTE_MEMOPS_F_DST_NT));
+ RTE_ASSERT(rte_is_aligned(src, 16));
+
+ if ((len & 15) == 0)
+ return;
+
+ /* Non-temporal load into bounce buffer. */
+ xmm0 = _mm_stream_load_si128_const(src);
+ _mm_store_si128((void *)buffer, xmm0);
+
+ /* Store from bounce buffer. */
+ if (((flags & RTE_MEMOPS_F_DSTA_MASK) >= RTE_MEMOPS_F_DST4A) ||
+ rte_is_aligned(dst, 4)) {
+ /* Destination is (known to be) 4 byte aligned. */
+ src = (const void *)buffer;
+ if (len & 8) {
+ if ((flags & RTE_MEMOPS_F_DSTA_MASK) >= RTE_MEMOPS_F_DST8A) {
+ /* Destination is known to be 8 byte aligned. */
+ _mm_stream_si64(dst, *(const int64_t *)src);
+ } else {
+ _mm_stream_si32(RTE_PTR_ADD(dst, 0), buffer[0]);
+ _mm_stream_si32(RTE_PTR_ADD(dst, 4), buffer[1]);
+ }
+ src = RTE_PTR_ADD(src, 8);
+ dst = RTE_PTR_ADD(dst, 8);
+ }
+ if (!((flags & RTE_MEMOPS_F_LENA_MASK) >= RTE_MEMOPS_F_LEN8A) &&
+ (len & 4)) {
+ _mm_stream_si32(dst, *(const int32_t *)src);
+ src = RTE_PTR_ADD(src, 4);
+ dst = RTE_PTR_ADD(dst, 4);
+ }
+
+ /* Non-temporal store is unavailable for the remaining 3 byte or less. */
+ if (!((flags & RTE_MEMOPS_F_LENA_MASK) >= RTE_MEMOPS_F_LEN4A) &&
+ (len & 2)) {
+ *(int16_t *)dst = *(const int16_t *)src;
+ src = RTE_PTR_ADD(src, 2);
+ dst = RTE_PTR_ADD(dst, 2);
+ }
+ if (!((flags & RTE_MEMOPS_F_LENA_MASK) >= RTE_MEMOPS_F_LEN2A) &&
+ (len & 1)) {
+ *(char *)dst = *(const char *)src;
+ }
+ } else {
+ /* Destination is not 4 byte aligned. Non-temporal store is unavailable. */
+ rte_mov15_or_less(dst, (const void *)buffer, len & 15);
+ }
+}
+
+/**
+ * @internal
+ * 16 byte aligned addresses non-temporal memory copy.
+ * The memory areas must not overlap.
+ *
+ * @param dst
+ * Pointer to the non-temporal destination memory area.
+ * Must be 16 byte aligned.
+ * @param src
+ * Pointer to the non-temporal source memory area.
+ * Must be 16 byte aligned.
+ * @param len
+ * Number of bytes to copy.
+ * @param flags
+ * Hints for memory access.
+ */
+__rte_internal
+static __rte_always_inline
+__attribute__((__nonnull__(1, 2)))
+#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 100000)
+__attribute__((__access__(write_only, 1, 3), __access__(read_only, 2, 3)))
+#endif
+void rte_memcpy_nt_d16s16a(void *__rte_restrict dst, const void *__rte_restrict src, size_t len,
+ const uint64_t flags)
+{
+ register __m128i xmm0, xmm1, xmm2, xmm3;
+
+#ifndef RTE_TOOLCHAIN_CLANG /* Clang doesn't support using __builtin_constant_p() like this. */
+ RTE_BUILD_BUG_ON(!__builtin_constant_p(flags));
+#endif /* !RTE_TOOLCHAIN_CLANG */
+ RTE_ASSERT(!(flags & RTE_MEMOPS_F_DSTA_MASK) || rte_is_aligned(dst,
+ (flags & RTE_MEMOPS_F_DSTA_MASK) >> RTE_MEMOPS_F_DSTA_SHIFT));
+ RTE_ASSERT(!(flags & RTE_MEMOPS_F_SRCA_MASK) || rte_is_aligned(src,
+ (flags & RTE_MEMOPS_F_SRCA_MASK) >> RTE_MEMOPS_F_SRCA_SHIFT));
+ RTE_ASSERT(!(flags & RTE_MEMOPS_F_LENA_MASK) || (len &
+ ((flags & RTE_MEMOPS_F_LENA_MASK) >> RTE_MEMOPS_F_LENA_SHIFT) - 1) == 0);
+
+ RTE_ASSERT((flags & (RTE_MEMOPS_F_SRC_NT | RTE_MEMOPS_F_DST_NT)) ==
+ (RTE_MEMOPS_F_SRC_NT | RTE_MEMOPS_F_DST_NT));
+ RTE_ASSERT(rte_is_aligned(dst, 16));
+ RTE_ASSERT(rte_is_aligned(src, 16));
+
+ if (unlikely(len == 0))
+ return;
+
+ /* Copy large portion of data in chunks of 64 byte. */
+ while (len >= 64) {
+ xmm0 = _mm_stream_load_si128_const(RTE_PTR_ADD(src, 0 * 16));
+ xmm1 = _mm_stream_load_si128_const(RTE_PTR_ADD(src, 1 * 16));
+ xmm2 = _mm_stream_load_si128_const(RTE_PTR_ADD(src, 2 * 16));
+ xmm3 = _mm_stream_load_si128_const(RTE_PTR_ADD(src, 3 * 16));
+ _mm_stream_si128(RTE_PTR_ADD(dst, 0 * 16), xmm0);
+ _mm_stream_si128(RTE_PTR_ADD(dst, 1 * 16), xmm1);
+ _mm_stream_si128(RTE_PTR_ADD(dst, 2 * 16), xmm2);
+ _mm_stream_si128(RTE_PTR_ADD(dst, 3 * 16), xmm3);
+ src = RTE_PTR_ADD(src, 64);
+ dst = RTE_PTR_ADD(dst, 64);
+ len -= 64;
+ }
+
+ /* Copy following 32 and 16 byte portions of data.
+ *
+ * Omitted if length is known to be respectively 64 or 32 byte aligned.
+ */
+ if (!((flags & RTE_MEMOPS_F_LENA_MASK) >= RTE_MEMOPS_F_LEN64A) &&
+ (len & 32)) {
+ xmm0 = _mm_stream_load_si128_const(RTE_PTR_ADD(src, 0 * 16));
+ xmm1 = _mm_stream_load_si128_const(RTE_PTR_ADD(src, 1 * 16));
+ _mm_stream_si128(RTE_PTR_ADD(dst, 0 * 16), xmm0);
+ _mm_stream_si128(RTE_PTR_ADD(dst, 1 * 16), xmm1);
+ src = RTE_PTR_ADD(src, 32);
+ dst = RTE_PTR_ADD(dst, 32);
+ }
+ if (!((flags & RTE_MEMOPS_F_LENA_MASK) >= RTE_MEMOPS_F_LEN32A) &&
+ (len & 16)) {
+ xmm2 = _mm_stream_load_si128_const(src);
+ _mm_stream_si128(dst, xmm2);
+ src = RTE_PTR_ADD(src, 16);
+ dst = RTE_PTR_ADD(dst, 16);
+ }
+
+ /* Copy remaining data, 15 byte or less, via bounce buffer.
+ *
+ * Omitted if length is known to be 16 byte aligned.
+ */
+ if (!((flags & RTE_MEMOPS_F_LENA_MASK) >= RTE_MEMOPS_F_LEN16A))
+ rte_memcpy_nt_15_or_less_s16a(dst, src, len,
+ (flags & ~(RTE_MEMOPS_F_DSTA_MASK | RTE_MEMOPS_F_SRCA_MASK)) |
+ (((flags & RTE_MEMOPS_F_DSTA_MASK) >= RTE_MEMOPS_F_DST16A) ?
+ flags : RTE_MEMOPS_F_DST16A) |
+ (((flags & RTE_MEMOPS_F_SRCA_MASK) >= RTE_MEMOPS_F_SRC16A) ?
+ flags : RTE_MEMOPS_F_SRC16A));
+}
+
+/**
+ * @internal
+ * 8/16 byte aligned destination/source addresses non-temporal memory copy.
+ * The memory areas must not overlap.
+ *
+ * @param dst
+ * Pointer to the non-temporal destination memory area.
+ * Must be 8 byte aligned.
+ * @param src
+ * Pointer to the non-temporal source memory area.
+ * Must be 16 byte aligned.
+ * @param len
+ * Number of bytes to copy.
+ * @param flags
+ * Hints for memory access.
+ */
+__rte_internal
+static __rte_always_inline
+__attribute__((__nonnull__(1, 2)))
+#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 100000)
+__attribute__((__access__(write_only, 1, 3), __access__(read_only, 2, 3)))
+#endif
+void rte_memcpy_nt_d8s16a(void *__rte_restrict dst, const void *__rte_restrict src, size_t len,
+ const uint64_t flags)
+{
+ int64_t buffer[8] __rte_cache_aligned /* at least __rte_aligned(16) */;
+ register __m128i xmm0, xmm1, xmm2, xmm3;
+
+#ifndef RTE_TOOLCHAIN_CLANG /* Clang doesn't support using __builtin_constant_p() like this. */
+ RTE_BUILD_BUG_ON(!__builtin_constant_p(flags));
+#endif /* !RTE_TOOLCHAIN_CLANG */
+ RTE_ASSERT(!(flags & RTE_MEMOPS_F_DSTA_MASK) || rte_is_aligned(dst,
+ (flags & RTE_MEMOPS_F_DSTA_MASK) >> RTE_MEMOPS_F_DSTA_SHIFT));
+ RTE_ASSERT(!(flags & RTE_MEMOPS_F_SRCA_MASK) || rte_is_aligned(src,
+ (flags & RTE_MEMOPS_F_SRCA_MASK) >> RTE_MEMOPS_F_SRCA_SHIFT));
+ RTE_ASSERT(!(flags & RTE_MEMOPS_F_LENA_MASK) || (len &
+ ((flags & RTE_MEMOPS_F_LENA_MASK) >> RTE_MEMOPS_F_LENA_SHIFT) - 1) == 0);
+
+ RTE_ASSERT((flags & (RTE_MEMOPS_F_SRC_NT | RTE_MEMOPS_F_DST_NT)) ==
+ (RTE_MEMOPS_F_SRC_NT | RTE_MEMOPS_F_DST_NT));
+ RTE_ASSERT(rte_is_aligned(dst, 8));
+ RTE_ASSERT(rte_is_aligned(src, 16));
+
+ if (unlikely(len == 0))
+ return;
+
+ /* Copy large portion of data in chunks of 64 byte. */
+ while (len >= 64) {
+ xmm0 = _mm_stream_load_si128_const(RTE_PTR_ADD(src, 0 * 16));
+ xmm1 = _mm_stream_load_si128_const(RTE_PTR_ADD(src, 1 * 16));
+ xmm2 = _mm_stream_load_si128_const(RTE_PTR_ADD(src, 2 * 16));
+ xmm3 = _mm_stream_load_si128_const(RTE_PTR_ADD(src, 3 * 16));
+ _mm_store_si128((void *)&buffer[0 * 2], xmm0);
+ _mm_store_si128((void *)&buffer[1 * 2], xmm1);
+ _mm_store_si128((void *)&buffer[2 * 2], xmm2);
+ _mm_store_si128((void *)&buffer[3 * 2], xmm3);
+ _mm_stream_si64(RTE_PTR_ADD(dst, 0 * 8), buffer[0]);
+ _mm_stream_si64(RTE_PTR_ADD(dst, 1 * 8), buffer[1]);
+ _mm_stream_si64(RTE_PTR_ADD(dst, 2 * 8), buffer[2]);
+ _mm_stream_si64(RTE_PTR_ADD(dst, 3 * 8), buffer[3]);
+ _mm_stream_si64(RTE_PTR_ADD(dst, 4 * 8), buffer[4]);
+ _mm_stream_si64(RTE_PTR_ADD(dst, 5 * 8), buffer[5]);
+ _mm_stream_si64(RTE_PTR_ADD(dst, 6 * 8), buffer[6]);
+ _mm_stream_si64(RTE_PTR_ADD(dst, 7 * 8), buffer[7]);
+ src = RTE_PTR_ADD(src, 64);
+ dst = RTE_PTR_ADD(dst, 64);
+ len -= 64;
+ }
+
+ /* Copy following 32 and 16 byte portions of data.
+ *
+ * Omitted if length is known to be respectively 64 or 32 byte aligned.
+ */
+ if (!((flags & RTE_MEMOPS_F_LENA_MASK) >= RTE_MEMOPS_F_LEN64A) &&
+ (len & 32)) {
+ xmm0 = _mm_stream_load_si128_const(RTE_PTR_ADD(src, 0 * 16));
+ xmm1 = _mm_stream_load_si128_const(RTE_PTR_ADD(src, 1 * 16));
+ _mm_store_si128((void *)&buffer[0 * 2], xmm0);
+ _mm_store_si128((void *)&buffer[1 * 2], xmm1);
+ _mm_stream_si64(RTE_PTR_ADD(dst, 0 * 8), buffer[0]);
+ _mm_stream_si64(RTE_PTR_ADD(dst, 1 * 8), buffer[1]);
+ _mm_stream_si64(RTE_PTR_ADD(dst, 2 * 8), buffer[2]);
+ _mm_stream_si64(RTE_PTR_ADD(dst, 3 * 8), buffer[3]);
+ src = RTE_PTR_ADD(src, 32);
+ dst = RTE_PTR_ADD(dst, 32);
+ }
+ if (!((flags & RTE_MEMOPS_F_LENA_MASK) >= RTE_MEMOPS_F_LEN32A) &&
+ (len & 16)) {
+ xmm2 = _mm_stream_load_si128_const(src);
+ _mm_store_si128((void *)&buffer[2 * 2], xmm2);
+ _mm_stream_si64(RTE_PTR_ADD(dst, 0 * 8), buffer[4]);
+ _mm_stream_si64(RTE_PTR_ADD(dst, 1 * 8), buffer[5]);
+ src = RTE_PTR_ADD(src, 16);
+ dst = RTE_PTR_ADD(dst, 16);
+ }
+
+ /* Copy remaining data, 15 byte or less, via bounce buffer.
+ *
+ * Omitted if length is known to be 16 byte aligned.
+ */
+ if (!((flags & RTE_MEMOPS_F_LENA_MASK) >= RTE_MEMOPS_F_LEN16A))
+ rte_memcpy_nt_15_or_less_s16a(dst, src, len,
+ (flags & ~(RTE_MEMOPS_F_DSTA_MASK | RTE_MEMOPS_F_SRCA_MASK)) |
+ (((flags & RTE_MEMOPS_F_DSTA_MASK) >= RTE_MEMOPS_F_DST8A) ?
+ flags : RTE_MEMOPS_F_DST8A) |
+ (((flags & RTE_MEMOPS_F_SRCA_MASK) >= RTE_MEMOPS_F_SRC16A) ?
+ flags : RTE_MEMOPS_F_SRC16A));
+}
+
+/**
+ * @internal
+ * 4/16 byte aligned destination/source addresses non-temporal memory copy.
+ * The memory areas must not overlap.
+ *
+ * @param dst
+ * Pointer to the non-temporal destination memory area.
+ * Must be 4 byte aligned.
+ * @param src
+ * Pointer to the non-temporal source memory area.
+ * Must be 16 byte aligned.
+ * @param len
+ * Number of bytes to copy.
+ * @param flags
+ * Hints for memory access.
+ */
+__rte_internal
+static __rte_always_inline
+__attribute__((__nonnull__(1, 2)))
+#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 100000)
+__attribute__((__access__(write_only, 1, 3), __access__(read_only, 2, 3)))
+#endif
+void rte_memcpy_nt_d4s16a(void *__rte_restrict dst, const void *__rte_restrict src, size_t len,
+ const uint64_t flags)
+{
+ int32_t buffer[16] __rte_cache_aligned /* at least __rte_aligned(16) */;
+ register __m128i xmm0, xmm1, xmm2, xmm3;
+
+#ifndef RTE_TOOLCHAIN_CLANG /* Clang doesn't support using __builtin_constant_p() like this. */
+ RTE_BUILD_BUG_ON(!__builtin_constant_p(flags));
+#endif /* !RTE_TOOLCHAIN_CLANG */
+ RTE_ASSERT(!(flags & RTE_MEMOPS_F_DSTA_MASK) || rte_is_aligned(dst,
+ (flags & RTE_MEMOPS_F_DSTA_MASK) >> RTE_MEMOPS_F_DSTA_SHIFT));
+ RTE_ASSERT(!(flags & RTE_MEMOPS_F_SRCA_MASK) || rte_is_aligned(src,
+ (flags & RTE_MEMOPS_F_SRCA_MASK) >> RTE_MEMOPS_F_SRCA_SHIFT));
+ RTE_ASSERT(!(flags & RTE_MEMOPS_F_LENA_MASK) || (len &
+ ((flags & RTE_MEMOPS_F_LENA_MASK) >> RTE_MEMOPS_F_LENA_SHIFT) - 1) == 0);
+
+ RTE_ASSERT((flags & (RTE_MEMOPS_F_SRC_NT | RTE_MEMOPS_F_DST_NT)) ==
+ (RTE_MEMOPS_F_SRC_NT | RTE_MEMOPS_F_DST_NT));
+ RTE_ASSERT(rte_is_aligned(dst, 4));
+ RTE_ASSERT(rte_is_aligned(src, 16));
+
+ if (unlikely(len == 0))
+ return;
+
+ /* Copy large portion of data in chunks of 64 byte. */
+ while (len >= 64) {
+ xmm0 = _mm_stream_load_si128_const(RTE_PTR_ADD(src, 0 * 16));
+ xmm1 = _mm_stream_load_si128_const(RTE_PTR_ADD(src, 1 * 16));
+ xmm2 = _mm_stream_load_si128_const(RTE_PTR_ADD(src, 2 * 16));
+ xmm3 = _mm_stream_load_si128_const(RTE_PTR_ADD(src, 3 * 16));
+ _mm_store_si128((void *)&buffer[0 * 4], xmm0);
+ _mm_store_si128((void *)&buffer[1 * 4], xmm1);
+ _mm_store_si128((void *)&buffer[2 * 4], xmm2);
+ _mm_store_si128((void *)&buffer[3 * 4], xmm3);
+ _mm_stream_si32(RTE_PTR_ADD(dst, 0 * 4), buffer[0]);
+ _mm_stream_si32(RTE_PTR_ADD(dst, 1 * 4), buffer[1]);
+ _mm_stream_si32(RTE_PTR_ADD(dst, 2 * 4), buffer[2]);
+ _mm_stream_si32(RTE_PTR_ADD(dst, 3 * 4), buffer[3]);
+ _mm_stream_si32(RTE_PTR_ADD(dst, 4 * 4), buffer[4]);
+ _mm_stream_si32(RTE_PTR_ADD(dst, 5 * 4), buffer[5]);
+ _mm_stream_si32(RTE_PTR_ADD(dst, 6 * 4), buffer[6]);
+ _mm_stream_si32(RTE_PTR_ADD(dst, 7 * 4), buffer[7]);
+ _mm_stream_si32(RTE_PTR_ADD(dst, 8 * 4), buffer[8]);
+ _mm_stream_si32(RTE_PTR_ADD(dst, 9 * 4), buffer[9]);
+ _mm_stream_si32(RTE_PTR_ADD(dst, 10 * 4), buffer[10]);
+ _mm_stream_si32(RTE_PTR_ADD(dst, 11 * 4), buffer[11]);
+ _mm_stream_si32(RTE_PTR_ADD(dst, 12 * 4), buffer[12]);
+ _mm_stream_si32(RTE_PTR_ADD(dst, 13 * 4), buffer[13]);
+ _mm_stream_si32(RTE_PTR_ADD(dst, 14 * 4), buffer[14]);
+ _mm_stream_si32(RTE_PTR_ADD(dst, 15 * 4), buffer[15]);
+ src = RTE_PTR_ADD(src, 64);
+ dst = RTE_PTR_ADD(dst, 64);
+ len -= 64;
+ }
+
+ /* Copy following 32 and 16 byte portions of data.
+ *
+ * Omitted if length is known to be respectively 64 or 32 byte aligned.
+ */
+ if (!((flags & RTE_MEMOPS_F_LENA_MASK) >= RTE_MEMOPS_F_LEN64A) &&
+ (len & 32)) {
+ xmm0 = _mm_stream_load_si128_const(RTE_PTR_ADD(src, 0 * 16));
+ xmm1 = _mm_stream_load_si128_const(RTE_PTR_ADD(src, 1 * 16));
+ _mm_store_si128((void *)&buffer[0 * 4], xmm0);
+ _mm_store_si128((void *)&buffer[1 * 4], xmm1);
+ _mm_stream_si32(RTE_PTR_ADD(dst, 0 * 4), buffer[0]);
+ _mm_stream_si32(RTE_PTR_ADD(dst, 1 * 4), buffer[1]);
+ _mm_stream_si32(RTE_PTR_ADD(dst, 2 * 4), buffer[2]);
+ _mm_stream_si32(RTE_PTR_ADD(dst, 3 * 4), buffer[3]);
+ _mm_stream_si32(RTE_PTR_ADD(dst, 4 * 4), buffer[4]);
+ _mm_stream_si32(RTE_PTR_ADD(dst, 5 * 4), buffer[5]);
+ _mm_stream_si32(RTE_PTR_ADD(dst, 6 * 4), buffer[6]);
+ _mm_stream_si32(RTE_PTR_ADD(dst, 7 * 4), buffer[7]);
+ src = RTE_PTR_ADD(src, 32);
+ dst = RTE_PTR_ADD(dst, 32);
+ }
+ if (!((flags & RTE_MEMOPS_F_LENA_MASK) >= RTE_MEMOPS_F_LEN32A) &&
+ (len & 16)) {
+ xmm2 = _mm_stream_load_si128_const(src);
+ _mm_store_si128((void *)&buffer[2 * 4], xmm2);
+ _mm_stream_si32(RTE_PTR_ADD(dst, 0 * 4), buffer[8]);
+ _mm_stream_si32(RTE_PTR_ADD(dst, 1 * 4), buffer[9]);
+ _mm_stream_si32(RTE_PTR_ADD(dst, 2 * 4), buffer[10]);
+ _mm_stream_si32(RTE_PTR_ADD(dst, 3 * 4), buffer[11]);
+ src = RTE_PTR_ADD(src, 16);
+ dst = RTE_PTR_ADD(dst, 16);
+ }
+
+ /* Copy remaining data, 15 byte or less, via bounce buffer.
+ *
+ * Omitted if length is known to be 16 byte aligned.
+ */
+ if (!((flags & RTE_MEMOPS_F_LENA_MASK) >= RTE_MEMOPS_F_LEN16A))
+ rte_memcpy_nt_15_or_less_s16a(dst, src, len,
+ (flags & ~(RTE_MEMOPS_F_DSTA_MASK | RTE_MEMOPS_F_SRCA_MASK)) |
+ (((flags & RTE_MEMOPS_F_DSTA_MASK) >= RTE_MEMOPS_F_DST4A) ?
+ flags : RTE_MEMOPS_F_DST4A) |
+ (((flags & RTE_MEMOPS_F_SRCA_MASK) >= RTE_MEMOPS_F_SRC16A) ?
+ flags : RTE_MEMOPS_F_SRC16A));
+}
+
+/**
+ * @internal
+ * 4 byte aligned addresses (non-temporal) memory copy.
+ * The memory areas must not overlap.
+ *
+ * @param dst
+ * Pointer to the (non-temporal) destination memory area.
+ * Must be 4 byte aligned if using non-temporal store.
+ * @param src
+ * Pointer to the (non-temporal) source memory area.
+ * Must be 4 byte aligned if using non-temporal load.
+ * @param len
+ * Number of bytes to copy.
+ * @param flags
+ * Hints for memory access.
+ */
+__rte_internal
+static __rte_always_inline
+__attribute__((__nonnull__(1, 2)))
+#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 100000)
+__attribute__((__access__(write_only, 1, 3), __access__(read_only, 2, 3)))
+#endif
+void rte_memcpy_nt_d4s4a(void *__rte_restrict dst, const void *__rte_restrict src, size_t len,
+ const uint64_t flags)
+{
+ /** How many bytes is source offset from 16 byte alignment (floor rounding). */
+ const size_t offset = (flags & RTE_MEMOPS_F_SRCA_MASK) >= RTE_MEMOPS_F_SRC16A ?
+ 0 : (uintptr_t)src & 15;
+
+#ifndef RTE_TOOLCHAIN_CLANG /* Clang doesn't support using __builtin_constant_p() like this. */
+ RTE_BUILD_BUG_ON(!__builtin_constant_p(flags));
+#endif /* !RTE_TOOLCHAIN_CLANG */
+ RTE_ASSERT(!(flags & RTE_MEMOPS_F_DSTA_MASK) || rte_is_aligned(dst,
+ (flags & RTE_MEMOPS_F_DSTA_MASK) >> RTE_MEMOPS_F_DSTA_SHIFT));
+ RTE_ASSERT(!(flags & RTE_MEMOPS_F_SRCA_MASK) || rte_is_aligned(src,
+ (flags & RTE_MEMOPS_F_SRCA_MASK) >> RTE_MEMOPS_F_SRCA_SHIFT));
+ RTE_ASSERT(!(flags & RTE_MEMOPS_F_LENA_MASK) || (len &
+ ((flags & RTE_MEMOPS_F_LENA_MASK) >> RTE_MEMOPS_F_LENA_SHIFT) - 1) == 0);
+
+ RTE_ASSERT((flags & (RTE_MEMOPS_F_SRC_NT | RTE_MEMOPS_F_DST_NT)) ==
+ (RTE_MEMOPS_F_SRC_NT | RTE_MEMOPS_F_DST_NT));
+ RTE_ASSERT(rte_is_aligned(dst, 4));
+ RTE_ASSERT(rte_is_aligned(src, 4));
+
+ if (unlikely(len == 0))
+ return;
+
+ if (offset == 0) {
+ /* Source is 16 byte aligned. */
+ /* Copy everything, using upgraded source alignment flags. */
+ rte_memcpy_nt_d4s16a(dst, src, len,
+ (flags & ~RTE_MEMOPS_F_SRCA_MASK) | RTE_MEMOPS_F_SRC16A);
+ } else {
+ /* Source is not 16 byte aligned, so make it 16 byte aligned. */
+ int32_t buffer[4] __rte_aligned(16);
+ const size_t first = 16 - offset;
+ register __m128i xmm0;
+
+ /* First, copy first part of data in chunks of 4 byte,
+ * to achieve 16 byte alignment of source.
+ * This invalidates the source, destination and length alignment flags, and
+ * may change whether the destination pointer is 16 byte aligned.
+ */
+
+ /** Copy from 16 byte aligned source pointer (floor rounding). */
+ xmm0 = _mm_stream_load_si128_const(RTE_PTR_SUB(src, offset));
+ _mm_store_si128((void *)buffer, xmm0);
+
+ if (unlikely(len + offset <= 16)) {
+ /* Short length. */
+ if (((flags & RTE_MEMOPS_F_LENA_MASK) >= RTE_MEMOPS_F_LEN4A) ||
+ (len & 3) == 0) {
+ /* Length is 4 byte aligned. */
+ switch (len) {
+ case 1 * 4:
+ /* Offset can be 1 * 4, 2 * 4 or 3 * 4. */
+ _mm_stream_si32(RTE_PTR_ADD(dst, 0 * 4),
+ buffer[offset / 4]);
+ break;
+ case 2 * 4:
+ /* Offset can be 1 * 4 or 2 * 4. */
+ _mm_stream_si32(RTE_PTR_ADD(dst, 0 * 4),
+ buffer[offset / 4]);
+ _mm_stream_si32(RTE_PTR_ADD(dst, 1 * 4),
+ buffer[offset / 4 + 1]);
+ break;
+ case 3 * 4:
+ /* Offset can only be 1 * 4. */
+ _mm_stream_si32(RTE_PTR_ADD(dst, 0 * 4), buffer[1]);
+ _mm_stream_si32(RTE_PTR_ADD(dst, 1 * 4), buffer[2]);
+ _mm_stream_si32(RTE_PTR_ADD(dst, 2 * 4), buffer[3]);
+ break;
+ }
+ } else {
+ /* Length is not 4 byte aligned. */
+ rte_mov15_or_less(dst, RTE_PTR_ADD(buffer, offset), len);
+ }
+ return;
+ }
+
+ switch (first) {
+ case 1 * 4:
+ _mm_stream_si32(RTE_PTR_ADD(dst, 0 * 4), buffer[3]);
+ break;
+ case 2 * 4:
+ _mm_stream_si32(RTE_PTR_ADD(dst, 0 * 4), buffer[2]);
+ _mm_stream_si32(RTE_PTR_ADD(dst, 1 * 4), buffer[3]);
+ break;
+ case 3 * 4:
+ _mm_stream_si32(RTE_PTR_ADD(dst, 0 * 4), buffer[1]);
+ _mm_stream_si32(RTE_PTR_ADD(dst, 1 * 4), buffer[2]);
+ _mm_stream_si32(RTE_PTR_ADD(dst, 2 * 4), buffer[3]);
+ break;
+ }
+
+ src = RTE_PTR_ADD(src, first);
+ dst = RTE_PTR_ADD(dst, first);
+ len -= first;
+
+ /* Source pointer is now 16 byte aligned. */
+ RTE_ASSERT(rte_is_aligned(src, 16));
+
+ /* Then, copy the rest, using corrected alignment flags. */
+ if (rte_is_aligned(dst, 16))
+ rte_memcpy_nt_d16s16a(dst, src, len, (flags &
+ ~(RTE_MEMOPS_F_DSTA_MASK | RTE_MEMOPS_F_SRCA_MASK |
+ RTE_MEMOPS_F_LENA_MASK)) |
+ RTE_MEMOPS_F_DST16A | RTE_MEMOPS_F_SRC16A |
+ (((flags & RTE_MEMOPS_F_LENA_MASK) >= RTE_MEMOPS_F_LEN4A) ?
+ RTE_MEMOPS_F_LEN4A : (flags & RTE_MEMOPS_F_LEN2A)));
+ else if (rte_is_aligned(dst, 8))
+ rte_memcpy_nt_d8s16a(dst, src, len, (flags &
+ ~(RTE_MEMOPS_F_DSTA_MASK | RTE_MEMOPS_F_SRCA_MASK |
+ RTE_MEMOPS_F_LENA_MASK)) |
+ RTE_MEMOPS_F_DST8A | RTE_MEMOPS_F_SRC16A |
+ (((flags & RTE_MEMOPS_F_LENA_MASK) >= RTE_MEMOPS_F_LEN4A) ?
+ RTE_MEMOPS_F_LEN4A : (flags & RTE_MEMOPS_F_LEN2A)));
+ else
+ rte_memcpy_nt_d4s16a(dst, src, len, (flags &
+ ~(RTE_MEMOPS_F_DSTA_MASK | RTE_MEMOPS_F_SRCA_MASK |
+ RTE_MEMOPS_F_LENA_MASK)) |
+ RTE_MEMOPS_F_DST4A | RTE_MEMOPS_F_SRC16A |
+ (((flags & RTE_MEMOPS_F_LENA_MASK) >= RTE_MEMOPS_F_LEN4A) ?
+ RTE_MEMOPS_F_LEN4A : (flags & RTE_MEMOPS_F_LEN2A)));
+ }
+}
+
+#ifndef RTE_MEMCPY_NT_BUFSIZE
+
+#include <rte_mbuf_core.h>
+
+/** Bounce buffer size for non-temporal memcpy.
+ *
+ * Must be 2^N and >= 128.
+ * The actual buffer will be slightly larger, due to added padding.
+ * The default is chosen to be able to handle a non-segmented packet.
+ */
+#define RTE_MEMCPY_NT_BUFSIZE RTE_MBUF_DEFAULT_DATAROOM
+
+#endif /* RTE_MEMCPY_NT_BUFSIZE */
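+
+/* Illustrative note (an assumption-based sketch, not part of this patch):
+ * because the default above is guarded by #ifndef, an application that never
+ * copies more than a few hundred bytes at a time could reduce the stack
+ * footprint of the bounce buffer by defining a smaller power of 2 size
+ * (>= 128) before including this header, e.g.:
+ *
+ *   #define RTE_MEMCPY_NT_BUFSIZE 512
+ *   #include <rte_memcpy.h>
+ */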
+
+/**
+ * @internal
+ * Non-temporal memory copy via bounce buffer.
+ *
+ * @note
+ * If the destination and/or length is unaligned, the first and/or last copied
+ * bytes will be stored in the destination memory area using temporal access.
+ *
+ * @param dst
+ * Pointer to the non-temporal destination memory area.
+ * @param src
+ * Pointer to the non-temporal source memory area.
+ * @param len
+ * Number of bytes to copy.
+ * Must be <= RTE_MEMCPY_NT_BUFSIZE.
+ * @param flags
+ * Hints for memory access.
+ */
+__rte_internal
+static __rte_always_inline
+__attribute__((__nonnull__(1, 2)))
+#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 100000)
+__attribute__((__access__(write_only, 1, 3), __access__(read_only, 2, 3)))
+#endif
+void rte_memcpy_nt_buf(void *__rte_restrict dst, const void *__rte_restrict src, size_t len,
+ const uint64_t flags)
+{
+ /** Cache line aligned bounce buffer with preceding and trailing padding.
+ *
+ * The preceding padding is one cache line, so the data area itself is
+ * cache line aligned, and there is room for the extra preceding bytes
+ * stored when the source is not 16 byte aligned.
+ * The trailing padding is 16 bytes, leaving room for the trailing bytes
+ * of a 16 byte store operation.
+ */
+ char buffer[RTE_CACHE_LINE_SIZE + RTE_MEMCPY_NT_BUFSIZE + 16]
+ __rte_cache_aligned;
+ /** Pointer to bounce buffer's aligned data area. */
+ char * const buf0 = &buffer[RTE_CACHE_LINE_SIZE];
+ void *buf;
+ /** Number of bytes to copy from source, incl. any extra preceding bytes. */
+ size_t srclen;
+ register __m128i xmm0, xmm1, xmm2, xmm3;
+
+#ifndef RTE_TOOLCHAIN_CLANG /* Clang doesn't support using __builtin_constant_p() like this. */
+ RTE_BUILD_BUG_ON(!__builtin_constant_p(flags));
+#endif /* !RTE_TOOLCHAIN_CLANG */
+ RTE_ASSERT(!(flags & RTE_MEMOPS_F_DSTA_MASK) || rte_is_aligned(dst,
+ (flags & RTE_MEMOPS_F_DSTA_MASK) >> RTE_MEMOPS_F_DSTA_SHIFT));
+ RTE_ASSERT(!(flags & RTE_MEMOPS_F_SRCA_MASK) || rte_is_aligned(src,
+ (flags & RTE_MEMOPS_F_SRCA_MASK) >> RTE_MEMOPS_F_SRCA_SHIFT));
+ RTE_ASSERT(!(flags & RTE_MEMOPS_F_LENA_MASK) || (len &
+ (((flags & RTE_MEMOPS_F_LENA_MASK) >> RTE_MEMOPS_F_LENA_SHIFT) - 1)) == 0);
+
+ RTE_ASSERT((flags & (RTE_MEMOPS_F_SRC_NT | RTE_MEMOPS_F_DST_NT)) ==
+ (RTE_MEMOPS_F_SRC_NT | RTE_MEMOPS_F_DST_NT));
+ RTE_ASSERT(len <= RTE_MEMCPY_NT_BUFSIZE);
+
+ if (unlikely(len == 0))
+ return;
+
+ /* Step 1:
+ * Copy data from the source to the bounce buffer's aligned data area,
+ * using aligned non-temporal load from the source,
+ * and unaligned store in the bounce buffer.
+ *
+ * If the source is unaligned, the additional bytes preceding the data will be copied
+ * to the padding area preceding the bounce buffer's aligned data area.
+ * Similarly, if the source data ends at an unaligned address, the additional bytes
+ * trailing the data will be copied to the padding area trailing the bounce buffer's
+ * aligned data area.
+ */
+
+ /* Adjust for extra preceding bytes, unless source is known to be 16 byte aligned. */
+ if ((flags & RTE_MEMOPS_F_SRCA_MASK) >= RTE_MEMOPS_F_SRC16A) {
+ buf = buf0;
+ srclen = len;
+ } else {
+ /** Number of bytes the source is offset from 16 byte alignment (floor rounding). */
+ const size_t offset = (uintptr_t)src & 15;
+
+ buf = RTE_PTR_SUB(buf0, offset);
+ src = RTE_PTR_SUB(src, offset);
+ srclen = len + offset;
+ }
+
+ /* Copy the bulk of the data from the source to the bounce buffer in 64 byte chunks. */
+ while (srclen >= 64) {
+ xmm0 = _mm_stream_load_si128_const(RTE_PTR_ADD(src, 0 * 16));
+ xmm1 = _mm_stream_load_si128_const(RTE_PTR_ADD(src, 1 * 16));
+ xmm2 = _mm_stream_load_si128_const(RTE_PTR_ADD(src, 2 * 16));
+ xmm3 = _mm_stream_load_si128_const(RTE_PTR_ADD(src, 3 * 16));
+ _mm_storeu_si128(RTE_PTR_ADD(buf, 0 * 16), xmm0);
+ _mm_storeu_si128(RTE_PTR_ADD(buf, 1 * 16), xmm1);
+ _mm_storeu_si128(RTE_PTR_ADD(buf, 2 * 16), xmm2);
+ _mm_storeu_si128(RTE_PTR_ADD(buf, 3 * 16), xmm3);
+ src = RTE_PTR_ADD(src, 64);
+ buf = RTE_PTR_ADD(buf, 64);
+ srclen -= 64;
+ }
+
+ /* Copy any remaining 32 byte and 16 byte portions of data from the source
+ * to the bounce buffer.
+ *
+ * Each step is omitted if the source is known to be 16 byte aligned
+ * (so the length alignment flags are still valid)
+ * and the length is known to be 64 or 32 byte aligned, respectively.
+ */
+ if (!(((flags & RTE_MEMOPS_F_SRCA_MASK) >= RTE_MEMOPS_F_SRC16A) &&
+ ((flags & RTE_MEMOPS_F_LENA_MASK) >= RTE_MEMOPS_F_LEN64A)) &&
+ (srclen & 32)) {
+ xmm0 = _mm_stream_load_si128_const(RTE_PTR_ADD(src, 0 * 16));
+ xmm1 = _mm_stream_load_si128_const(RTE_PTR_ADD(src, 1 * 16));
+ _mm_storeu_si128(RTE_PTR_ADD(buf, 0 * 16), xmm0);
+ _mm_storeu_si128(RTE_PTR_ADD(buf, 1 * 16), xmm1);
+ src = RTE_PTR_ADD(src, 32);
+ buf = RTE_PTR_ADD(buf, 32);
+ }
+ if (!(((flags & RTE_MEMOPS_F_SRCA_MASK) >= RTE_MEMOPS_F_SRC16A) &&
+ ((flags & RTE_MEMOPS_F_LENA_MASK) >= RTE_MEMOPS_F_LEN32A)) &&
+ (srclen & 16)) {
+ xmm2 = _mm_stream_load_si128_const(src);
+ _mm_storeu_si128(buf, xmm2);
+ src = RTE_PTR_ADD(src, 16);
+ buf = RTE_PTR_ADD(buf, 16);
+ }
+ /* Copy any trailing bytes of data from source to bounce buffer.
+ *
+ * Omitted if source is known to be 16 byte aligned (so the length alignment
+ * flags are still valid)
+ * and length is known to be 16 byte aligned.
+ */
+ if (!(((flags & RTE_MEMOPS_F_SRCA_MASK) >= RTE_MEMOPS_F_SRC16A) &&
+ ((flags & RTE_MEMOPS_F_LENA_MASK) >= RTE_MEMOPS_F_LEN16A)) &&
+ (srclen & 15)) {
+ xmm3 = _mm_stream_load_si128_const(src);
+ _mm_storeu_si128(buf, xmm3);
+ }
+
+ /* Step 2:
+ * Copy from the aligned bounce buffer to the non-temporal destination.
+ */
+ rte_memcpy_ntd(dst, buf0, len,
+ (flags & ~(RTE_MEMOPS_F_SRC_NT | RTE_MEMOPS_F_SRCA_MASK)) |
+ (RTE_CACHE_LINE_SIZE << RTE_MEMOPS_F_SRCA_SHIFT));
+}
+
+/**
+ * @internal
+ * Non-temporal memory copy.
+ * The memory areas must not overlap.
+ *
+ * @note
+ * If the destination and/or length is unaligned, some copied bytes will be
+ * stored in the destination memory area using temporal access.
+ *
+ * @param dst
+ * Pointer to the non-temporal destination memory area.
+ * @param src
+ * Pointer to the non-temporal source memory area.
+ * @param len
+ * Number of bytes to copy.
+ * @param flags
+ * Hints for memory access.
+ */
+__rte_internal
+static __rte_always_inline
+__attribute__((__nonnull__(1, 2)))
+#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 100000)
+__attribute__((__access__(write_only, 1, 3), __access__(read_only, 2, 3)))
+#endif
+void rte_memcpy_nt_generic(void *__rte_restrict dst, const void *__rte_restrict src, size_t len,
+ const uint64_t flags)
+{
+#ifndef RTE_TOOLCHAIN_CLANG /* Clang doesn't support using __builtin_constant_p() like this. */
+ RTE_BUILD_BUG_ON(!__builtin_constant_p(flags));
+#endif /* !RTE_TOOLCHAIN_CLANG */
+
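+ /* Split the copy into bounce buffer sized chunks.
+ * Each full chunk is RTE_MEMCPY_NT_BUFSIZE bytes, which is a power of 2
+ * and at least 128, so the length alignment flags for these chunks can be
+ * upgraded to RTE_MEMOPS_F_LEN128A.
+ */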
+ while (len > RTE_MEMCPY_NT_BUFSIZE) {
+ rte_memcpy_nt_buf(dst, src, RTE_MEMCPY_NT_BUFSIZE,
+ (flags & ~RTE_MEMOPS_F_LENA_MASK) | RTE_MEMOPS_F_LEN128A);
+ dst = RTE_PTR_ADD(dst, RTE_MEMCPY_NT_BUFSIZE);
+ src = RTE_PTR_ADD(src, RTE_MEMCPY_NT_BUFSIZE);
+ len -= RTE_MEMCPY_NT_BUFSIZE;
+ }
+ rte_memcpy_nt_buf(dst, src, len, flags);
+}
+
+/* Implementation. Refer to function declaration for documentation. */
+__rte_experimental
+static __rte_always_inline
+__attribute__((__nonnull__(1, 2)))
+#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 100000)
+__attribute__((__access__(write_only, 1, 3), __access__(read_only, 2, 3)))
+#endif
+void rte_memcpy_ex(void *__rte_restrict dst, const void *__rte_restrict src, size_t len,
+ const uint64_t flags)
+{
+#ifndef RTE_TOOLCHAIN_CLANG /* Clang doesn't support using __builtin_constant_p() like this. */
+ RTE_BUILD_BUG_ON(!__builtin_constant_p(flags));
+#endif /* !RTE_TOOLCHAIN_CLANG */
+ RTE_ASSERT(!(flags & RTE_MEMOPS_F_DSTA_MASK) || rte_is_aligned(dst,
+ (flags & RTE_MEMOPS_F_DSTA_MASK) >> RTE_MEMOPS_F_DSTA_SHIFT));
+ RTE_ASSERT(!(flags & RTE_MEMOPS_F_SRCA_MASK) || rte_is_aligned(src,
+ (flags & RTE_MEMOPS_F_SRCA_MASK) >> RTE_MEMOPS_F_SRCA_SHIFT));
+ RTE_ASSERT(!(flags & RTE_MEMOPS_F_LENA_MASK) || (len &
+ (((flags & RTE_MEMOPS_F_LENA_MASK) >> RTE_MEMOPS_F_LENA_SHIFT) - 1)) == 0);
+
+ if ((flags & (RTE_MEMOPS_F_DST_NT | RTE_MEMOPS_F_SRC_NT)) ==
+ (RTE_MEMOPS_F_DST_NT | RTE_MEMOPS_F_SRC_NT)) {
+ /* Copy between non-temporal source and destination. */
+ if ((flags & RTE_MEMOPS_F_DSTA_MASK) >= RTE_MEMOPS_F_DST16A &&
+ (flags & RTE_MEMOPS_F_SRCA_MASK) >= RTE_MEMOPS_F_SRC16A)
+ rte_memcpy_nt_d16s16a(dst, src, len, flags);
+ else if ((flags & RTE_MEMOPS_F_DSTA_MASK) >= RTE_MEMOPS_F_DST8A &&
+ (flags & RTE_MEMOPS_F_SRCA_MASK) >= RTE_MEMOPS_F_SRC16A)
+ rte_memcpy_nt_d8s16a(dst, src, len, flags);
+ else if ((flags & RTE_MEMOPS_F_DSTA_MASK) >= RTE_MEMOPS_F_DST4A &&
+ (flags & RTE_MEMOPS_F_SRCA_MASK) >= RTE_MEMOPS_F_SRC16A)
+ rte_memcpy_nt_d4s16a(dst, src, len, flags);
+ else if ((flags & RTE_MEMOPS_F_DSTA_MASK) >= RTE_MEMOPS_F_DST4A &&
+ (flags & RTE_MEMOPS_F_SRCA_MASK) >= RTE_MEMOPS_F_SRC4A)
+ rte_memcpy_nt_d4s4a(dst, src, len, flags);
+ else if (len <= RTE_MEMCPY_NT_BUFSIZE)
+ rte_memcpy_nt_buf(dst, src, len, flags);
+ else
+ rte_memcpy_nt_generic(dst, src, len, flags);
+ } else if (flags & RTE_MEMOPS_F_SRC_NT) {
+ /* Copy from non-temporal source. */
+ rte_memcpy_nts(dst, src, len, flags);
+ } else if (flags & RTE_MEMOPS_F_DST_NT) {
+ /* Copy to non-temporal destination. */
+ rte_memcpy_ntd(dst, src, len, flags);
+ } else
+ rte_memcpy(dst, src, len);
+}
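+
+/* Usage sketch (illustrative only; 'dst' and 'src' and their alignment are
+ * assumptions of this example, not requirements of the API): a non-temporal
+ * copy of 1024 bytes where the caller knows that both pointers are 16 byte
+ * aligned and that the length is a multiple of 16, so the
+ * rte_memcpy_nt_d16s16a() path above is selected at compile time:
+ *
+ *   rte_memcpy_ex(dst, src, 1024,
+ *           RTE_MEMOPS_F_SRC_NT | RTE_MEMOPS_F_DST_NT |
+ *           RTE_MEMOPS_F_SRC16A | RTE_MEMOPS_F_DST16A | RTE_MEMOPS_F_LEN16A);
+ */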
+
#undef ALIGNMENT_MASK
#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 100000)
@@ -660,6 +660,83 @@ rte_pktmbuf_copy(const struct rte_mbuf *m, struct rte_mempool *mp,
return mc;
}
+/* Create a deep copy of mbuf, using non-temporal memory access */
+struct rte_mbuf *
+rte_pktmbuf_copy_ex(const struct rte_mbuf *m, struct rte_mempool *mp,
+ uint32_t off, uint32_t len, const uint64_t flags)
+{
+ const struct rte_mbuf *seg = m;
+ struct rte_mbuf *mc, *m_last, **prev;
+
+ /* garbage in check */
+ __rte_mbuf_sanity_check(m, 1);
+
+ /* check for request to copy at offset past end of mbuf */
+ if (unlikely(off >= m->pkt_len))
+ return NULL;
+
+ mc = rte_pktmbuf_alloc(mp);
+ if (unlikely(mc == NULL))
+ return NULL;
+
+ /* truncate requested length to available data */
+ if (len > m->pkt_len - off)
+ len = m->pkt_len - off;
+
+ __rte_pktmbuf_copy_hdr(mc, m);
+
+ /* copied mbuf is not indirect or external */
+ mc->ol_flags = m->ol_flags & ~(RTE_MBUF_F_INDIRECT|RTE_MBUF_F_EXTERNAL);
+
+ prev = &mc->next;
+ m_last = mc;
+ while (len > 0) {
+ uint32_t copy_len;
+
+ /* skip leading mbuf segments */
+ while (off >= seg->data_len) {
+ off -= seg->data_len;
+ seg = seg->next;
+ }
+
+ /* current buffer is full, chain a new one */
+ if (rte_pktmbuf_tailroom(m_last) == 0) {
+ m_last = rte_pktmbuf_alloc(mp);
+ if (unlikely(m_last == NULL)) {
+ rte_pktmbuf_free(mc);
+ return NULL;
+ }
+ ++mc->nb_segs;
+ *prev = m_last;
+ prev = &m_last->next;
+ }
+
+ /*
+ * copy the min of data in input segment (seg)
+ * vs space available in output (m_last)
+ */
+ copy_len = RTE_MIN(seg->data_len - off, len);
+ if (copy_len > rte_pktmbuf_tailroom(m_last))
+ copy_len = rte_pktmbuf_tailroom(m_last);
+
+ /* append from seg to m_last */
+ rte_memcpy_ex(rte_pktmbuf_mtod_offset(m_last, char *,
+ m_last->data_len),
+ rte_pktmbuf_mtod_offset(seg, char *, off),
+ copy_len, flags);
+
+ /* update offsets and lengths */
+ m_last->data_len += copy_len;
+ mc->pkt_len += copy_len;
+ off += copy_len;
+ len -= copy_len;
+ }
+
+ /* garbage out check */
+ __rte_mbuf_sanity_check(mc, 1);
+ return mc;
+}
+
/* dump a mbuf on console */
void
rte_pktmbuf_dump(FILE *f, const struct rte_mbuf *m, unsigned dump_len)
@@ -1443,6 +1443,38 @@ struct rte_mbuf *
rte_pktmbuf_copy(const struct rte_mbuf *m, struct rte_mempool *mp,
uint32_t offset, uint32_t length);
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Create a full copy of a given packet mbuf,
+ * using non-temporal memory access as specified by flags.
+ *
+ * Copies all the data from a given packet mbuf to a newly allocated
+ * set of mbufs. The private data are not copied.
+ *
+ * @param m
+ * The packet mbuf to be copied.
+ * @param mp
+ * The mempool from which the "clone" mbufs are allocated.
+ * @param offset
+ * The number of bytes to skip before copying.
+ * If the mbuf does not have that many bytes, it is an error
+ * and NULL is returned.
+ * @param length
+ * The upper limit on bytes to copy. Passing UINT32_MAX
+ * means all data (after offset).
+ * @param flags
+ * Non-temporal memory access hints for rte_memcpy_ex.
+ * @return
+ * - The pointer to the new "clone" mbuf on success.
+ * - NULL if allocation fails.
+ */
+__rte_experimental
+struct rte_mbuf *
+rte_pktmbuf_copy_ex(const struct rte_mbuf *m, struct rte_mempool *mp,
+ uint32_t offset, uint32_t length, const uint64_t flags);
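+
+/* Usage sketch (illustrative only; 'pkt' and 'pool' are hypothetical
+ * application variables): take a full non-temporal snapshot of a packet,
+ * e.g. for mirroring or capture, without polluting the data cache with
+ * the packet contents:
+ *
+ *   struct rte_mbuf *mc = rte_pktmbuf_copy_ex(pkt, pool, 0, UINT32_MAX,
+ *           RTE_MEMOPS_F_SRC_NT | RTE_MEMOPS_F_DST_NT);
+ *   if (mc == NULL)
+ *       ...; // handle allocation failure
+ */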
+
/**
* Adds given value to the refcnt of all packet mbuf segments.
*
@@ -47,5 +47,6 @@ EXPERIMENTAL {
global:
rte_pktmbuf_pool_create_extbuf;
+ rte_pktmbuf_copy_ex;
};
@@ -466,7 +466,8 @@ rte_pcapng_copy(uint16_t port_id, uint32_t queue,
orig_len = rte_pktmbuf_pkt_len(md);
/* Take snapshot of the data */
- mc = rte_pktmbuf_copy(md, mp, 0, length);
+ mc = rte_pktmbuf_copy_ex(md, mp, 0, length,
+ RTE_MEMOPS_F_SRC_NT | RTE_MEMOPS_F_DST_NT);
if (unlikely(mc == NULL))
return NULL;
@@ -124,7 +124,8 @@ pdump_copy(uint16_t port_id, uint16_t queue,
pkts[i], mp, cbs->snaplen,
ts, direction);
else
- p = rte_pktmbuf_copy(pkts[i], mp, 0, cbs->snaplen);
+ p = rte_pktmbuf_copy_ex(pkts[i], mp, 0, cbs->snaplen,
+ RTE_MEMOPS_F_SRC_NT | RTE_MEMOPS_F_DST_NT);
if (unlikely(p == NULL))
__atomic_fetch_add(&stats->nombuf, 1, __ATOMIC_RELAXED);
@@ -134,6 +135,9 @@ pdump_copy(uint16_t port_id, uint16_t queue,
__atomic_fetch_add(&stats->accepted, d_pkts, __ATOMIC_RELAXED);
+ /* Ensure the non-temporal stores of the packet copies are globally
+ * visible before the copies are enqueued to the ring.
+ */
+ rte_wmb();
+
ring_enq = rte_ring_enqueue_burst(ring, (void *)dup_bufs, d_pkts, NULL);
if (unlikely(ring_enq < d_pkts)) {
unsigned int drops = d_pkts - ring_enq;