[v5,5/5] test/thash: add performance tests for the Toeplitz hash

Message ID 1634842469-27119-6-git-send-email-vladimir.medvedkin@intel.com (mailing list archive)
State Superseded, archived
Delegated to: Thomas Monjalon
Headers
Series optimized Toeplitz hash implementation |

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/github-robot: build success github build: passed
ci/Intel-compilation success Compilation OK
ci/intel-Testing success Testing PASS

Commit Message

Vladimir Medvedkin Oct. 21, 2021, 6:54 p.m. UTC
  This patch adds performance tests for different implementations
of the Toeplitz hash function.

Signed-off-by: Vladimir Medvedkin <vladimir.medvedkin@intel.com>
---
 app/test/meson.build       |   2 +
 app/test/test_thash_perf.c | 120 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 122 insertions(+)
 create mode 100644 app/test/test_thash_perf.c
  

Comments

Thomas Monjalon Oct. 25, 2021, 5:02 p.m. UTC | #1
21/10/2021 20:54, Vladimir Medvedkin:
> This patch adds performance tests for different implementations
> of the Toeplitz hash function.

Please name them.

> Signed-off-by: Vladimir Medvedkin <vladimir.medvedkin@intel.com>

There is some garbage,

> @@ -320,6 +321,7 @@ perf_test_names = [
>          'hash_readwrite_lf_perf_autotest',
>          'trace_perf_autotest',
>          'ipsec_perf_autotest',
> +	'thash_perf_autotest',

here (tabs instead of spaces)

>  driver_test_names = [
> diff --git a/app/test/test_thash_perf.c b/app/test/test_thash_perf.c
> new file mode 100644
> index 0000000..fb66e20
> --- /dev/null
> +++ b/app/test/test_thash_perf.c
> @@ -0,0 +1,120 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2021 Intel Corporation
> + */
> +
> +#include <stdio.h>
> +#include <stdint.h>
> +#include <stdlib.h>
> +#include <math.h>
> +
> +#include <rte_cycles.h>
> +#include <rte_malloc.h>
> +#include <rte_random.h>
> +#include <rte_thash.h>
> +
> +#include "test.h"
> +
> +#define ITERATIONS	(1 << 15)
> +#define	BATCH_SZ	(1 << 10)
> +
> +#define IPV4_2_TUPLE_LEN	(8)
> +#define IPV4_4_TUPLE_LEN	(12)
> +#define IPV6_2_TUPLE_LEN	(32)
> +#define IPV6_4_TUPLE_LEN	(36)
> +
> +
> +static uint8_t default_rss_key[] = {
> +	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
> +	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
> +	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
> +	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
> +	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
> +};
> +
> +static void
> +run_thash_test(unsigned int tuple_len)
> +{
> +	uint32_t *tuples[BATCH_SZ];
> +	unsigned int i, j;
> +	uint64_t start_tsc, end_tsc;
> +	uint32_t len = RTE_ALIGN_CEIL(tuple_len, sizeof(uint32_t));
> +	volatile uint32_t hash = 0;
> +	uint32_t bulk_hash[BATCH_SZ] = { 0 };
> +
> +	for (i = 0; i < BATCH_SZ; i++) {
> +		tuples[i] = rte_zmalloc(NULL, len, 0);
> +		for (j = 0; j < len / sizeof(uint32_t); j++)
> +			tuples[i][j] = rte_rand();
> +	}
> +
> +	start_tsc = rte_rdtsc_precise();
> +	for (i = 0; i < ITERATIONS; i++) {
> +		for (j = 0; j < BATCH_SZ; j++) {
> +			hash ^= rte_softrss(tuples[j], len / sizeof(uint32_t),
> +				default_rss_key);
> +		}
> +	}
> +	end_tsc = rte_rdtsc_precise();
> +
> +	printf("Average rte_softrss() takes \t\t%.1f cycles for key len %d\n",
> +		(double)(end_tsc - start_tsc) / (double)(ITERATIONS *
> +		BATCH_SZ), len);
> +
> +	start_tsc = rte_rdtsc_precise();
> +	for (i = 0; i < ITERATIONS; i++) {
> +		for (j = 0; j < BATCH_SZ; j++) {
> +			hash ^= rte_softrss_be(tuples[j], len /
> +				sizeof(uint32_t), default_rss_key);
> +		}
> +	}
> +	end_tsc = rte_rdtsc_precise();
> +
> +	printf("Average rte_softrss_be() takes \t\t%.1f cycles for key len %d\n",
> +		(double)(end_tsc - start_tsc) / (double)(ITERATIONS *
> +		BATCH_SZ), len);

The function could stop here (one function per type of implementation).

> +
> +	if (!rte_thash_gfni_supported())
> +		return;
> +
> +	uint64_t rss_key_matrixes[RTE_DIM(default_rss_key)];
> +
> +	rte_thash_complete_matrix(rss_key_matrixes, default_rss_key,
> +		RTE_DIM(default_rss_key));
> +
> +	start_tsc = rte_rdtsc_precise();
> +	for (i = 0; i < ITERATIONS; i++) {
> +		for (j = 0; j < BATCH_SZ; j++)
> +			hash ^= rte_thash_gfni(rss_key_matrixes,
> +				(uint8_t *)tuples[j], len);
> +	}
> +	end_tsc = rte_rdtsc_precise();
> +
> +	printf("Average rte_thash_gfni takes \t\t%.1f cycles for key len %d\n",
> +		(double)(end_tsc - start_tsc) / (double)(ITERATIONS *
> +		BATCH_SZ), len);
> +
> +	start_tsc = rte_rdtsc_precise();
> +	for (i = 0; i < ITERATIONS; i++)
> +		rte_thash_gfni_bulk(rss_key_matrixes, len, (uint8_t **)tuples,
> +			bulk_hash, BATCH_SZ);
> +
> +	end_tsc = rte_rdtsc_precise();
> +
> +	printf("Average rte_thash_gfni_x2 takes \t%.1f cycles for key len %d\n",

and here, the function name is not updated.

> +		(double)(end_tsc - start_tsc) / (double)(ITERATIONS *
> +		BATCH_SZ), len);
> +

useless blank line

> +}
  
Stephen Hemminger Oct. 25, 2021, 5:27 p.m. UTC | #2
On Thu, 21 Oct 2021 19:54:29 +0100
Vladimir Medvedkin <vladimir.medvedkin@intel.com> wrote:

> +static uint8_t default_rss_key[] = {

Should this be const?

That way you can make sure API isn't modifying it.
  
Vladimir Medvedkin Oct. 26, 2021, 8:29 p.m. UTC | #3
Hi Thomas,

Thanks for the review, I'll address your comments in v6.
Please find my comment below

On 25/10/2021 19:02, Thomas Monjalon wrote:
> 21/10/2021 20:54, Vladimir Medvedkin:
>> This patch adds performance tests for different implementations
>> of the Toeplitz hash function.
> 
> Please name them.
> 
>> Signed-off-by: Vladimir Medvedkin <vladimir.medvedkin@intel.com>
> 
> There are some garbage,
> 
>> @@ -320,6 +321,7 @@ perf_test_names = [
>>           'hash_readwrite_lf_perf_autotest',
>>           'trace_perf_autotest',
>>           'ipsec_perf_autotest',
>> +	'thash_perf_autotest',
> 
> here (tabs instead of space)
> 
>>   driver_test_names = [
>> diff --git a/app/test/test_thash_perf.c b/app/test/test_thash_perf.c
>> new file mode 100644
>> index 0000000..fb66e20
>> --- /dev/null
>> +++ b/app/test/test_thash_perf.c
>> @@ -0,0 +1,120 @@
>> +/* SPDX-License-Identifier: BSD-3-Clause
>> + * Copyright(c) 2021 Intel Corporation
>> + */
>> +
>> +#include <stdio.h>
>> +#include <stdint.h>
>> +#include <stdlib.h>
>> +#include <math.h>
>> +
>> +#include <rte_cycles.h>
>> +#include <rte_malloc.h>
>> +#include <rte_random.h>
>> +#include <rte_thash.h>
>> +
>> +#include "test.h"
>> +
>> +#define ITERATIONS	(1 << 15)
>> +#define	BATCH_SZ	(1 << 10)
>> +
>> +#define IPV4_2_TUPLE_LEN	(8)
>> +#define IPV4_4_TUPLE_LEN	(12)
>> +#define IPV6_2_TUPLE_LEN	(32)
>> +#define IPV6_4_TUPLE_LEN	(36)
>> +
>> +
>> +static uint8_t default_rss_key[] = {
>> +	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
>> +	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
>> +	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
>> +	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
>> +	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
>> +};
>> +
>> +static void
>> +run_thash_test(unsigned int tuple_len)
>> +{
>> +	uint32_t *tuples[BATCH_SZ];
>> +	unsigned int i, j;
>> +	uint64_t start_tsc, end_tsc;
>> +	uint32_t len = RTE_ALIGN_CEIL(tuple_len, sizeof(uint32_t));
>> +	volatile uint32_t hash = 0;
>> +	uint32_t bulk_hash[BATCH_SZ] = { 0 };
>> +
>> +	for (i = 0; i < BATCH_SZ; i++) {
>> +		tuples[i] = rte_zmalloc(NULL, len, 0);
>> +		for (j = 0; j < len / sizeof(uint32_t); j++)
>> +			tuples[i][j] = rte_rand();
>> +	}
>> +
>> +	start_tsc = rte_rdtsc_precise();
>> +	for (i = 0; i < ITERATIONS; i++) {
>> +		for (j = 0; j < BATCH_SZ; j++) {
>> +			hash ^= rte_softrss(tuples[j], len / sizeof(uint32_t),
>> +				default_rss_key);
>> +		}
>> +	}
>> +	end_tsc = rte_rdtsc_precise();
>> +
>> +	printf("Average rte_softrss() takes \t\t%.1f cycles for key len %d\n",
>> +		(double)(end_tsc - start_tsc) / (double)(ITERATIONS *
>> +		BATCH_SZ), len);
>> +
>> +	start_tsc = rte_rdtsc_precise();
>> +	for (i = 0; i < ITERATIONS; i++) {
>> +		for (j = 0; j < BATCH_SZ; j++) {
>> +			hash ^= rte_softrss_be(tuples[j], len /
>> +				sizeof(uint32_t), default_rss_key);
>> +		}
>> +	}
>> +	end_tsc = rte_rdtsc_precise();
>> +
>> +	printf("Average rte_softrss_be() takes \t\t%.1f cycles for key len %d\n",
>> +		(double)(end_tsc - start_tsc) / (double)(ITERATIONS *
>> +		BATCH_SZ), len);
> 
> The function could stop here (one function per type of implementation).
> 

Could you please clarify what you mean?
The function stops here if the machine does not support GFNI, and this is
done intentionally. On a machine without GFNI it tests only the scalar
implementations for every given length.

>> +
>> +	if (!rte_thash_gfni_supported())
>> +		return;
>> +
>> +	uint64_t rss_key_matrixes[RTE_DIM(default_rss_key)];
>> +
>> +	rte_thash_complete_matrix(rss_key_matrixes, default_rss_key,
>> +		RTE_DIM(default_rss_key));
>> +
>> +	start_tsc = rte_rdtsc_precise();
>> +	for (i = 0; i < ITERATIONS; i++) {
>> +		for (j = 0; j < BATCH_SZ; j++)
>> +			hash ^= rte_thash_gfni(rss_key_matrixes,
>> +				(uint8_t *)tuples[j], len);
>> +	}
>> +	end_tsc = rte_rdtsc_precise();
>> +
>> +	printf("Average rte_thash_gfni takes \t\t%.1f cycles for key len %d\n",
>> +		(double)(end_tsc - start_tsc) / (double)(ITERATIONS *
>> +		BATCH_SZ), len);
>> +
>> +	start_tsc = rte_rdtsc_precise();
>> +	for (i = 0; i < ITERATIONS; i++)
>> +		rte_thash_gfni_bulk(rss_key_matrixes, len, (uint8_t **)tuples,
>> +			bulk_hash, BATCH_SZ);
>> +
>> +	end_tsc = rte_rdtsc_precise();
>> +
>> +	printf("Average rte_thash_gfni_x2 takes \t%.1f cycles for key len %d\n",
> 
> and here, the function name is not updated.
> 
>> +		(double)(end_tsc - start_tsc) / (double)(ITERATIONS *
>> +		BATCH_SZ), len);
>> +
> 
> useless blank line
> 
>> +}
> 
> 
>
  
Vladimir Medvedkin Oct. 26, 2021, 8:31 p.m. UTC | #4
Hi Stephen,

On 25/10/2021 19:27, Stephen Hemminger wrote:
> On Thu, 21 Oct 2021 19:54:29 +0100
> Vladimir Medvedkin <vladimir.medvedkin@intel.com> wrote:
> 
>> +static uint8_t default_rss_key[] = {
> 
> Should this be const?
> 
> That way you can make sure API isn't modifying it.
> 

Thanks, I'll fix this in v6
  
Thomas Monjalon Oct. 27, 2021, 8:29 a.m. UTC | #5
26/10/2021 22:29, Medvedkin, Vladimir:
> Hi Thomas,
> 
> Thanks for the review, I'll address your comments in v6.
> Please find my comment below
> 
> On 25/10/2021 19:02, Thomas Monjalon wrote:
> > 21/10/2021 20:54, Vladimir Medvedkin:
> >> This patch adds performance tests for different implementations
> >> of the Toeplitz hash function.
> > 
> > Please name them.
> > 
> >> Signed-off-by: Vladimir Medvedkin <vladimir.medvedkin@intel.com>
> > 
> > There are some garbage,
> > 
> >> @@ -320,6 +321,7 @@ perf_test_names = [
> >>           'hash_readwrite_lf_perf_autotest',
> >>           'trace_perf_autotest',
> >>           'ipsec_perf_autotest',
> >> +	'thash_perf_autotest',
> > 
> > here (tabs instead of space)
> > 
> >>   driver_test_names = [
> >> diff --git a/app/test/test_thash_perf.c b/app/test/test_thash_perf.c
> >> new file mode 100644
> >> index 0000000..fb66e20
> >> --- /dev/null
> >> +++ b/app/test/test_thash_perf.c
> >> @@ -0,0 +1,120 @@
> >> +/* SPDX-License-Identifier: BSD-3-Clause
> >> + * Copyright(c) 2021 Intel Corporation
> >> + */
> >> +
> >> +#include <stdio.h>
> >> +#include <stdint.h>
> >> +#include <stdlib.h>
> >> +#include <math.h>
> >> +
> >> +#include <rte_cycles.h>
> >> +#include <rte_malloc.h>
> >> +#include <rte_random.h>
> >> +#include <rte_thash.h>
> >> +
> >> +#include "test.h"
> >> +
> >> +#define ITERATIONS	(1 << 15)
> >> +#define	BATCH_SZ	(1 << 10)
> >> +
> >> +#define IPV4_2_TUPLE_LEN	(8)
> >> +#define IPV4_4_TUPLE_LEN	(12)
> >> +#define IPV6_2_TUPLE_LEN	(32)
> >> +#define IPV6_4_TUPLE_LEN	(36)
> >> +
> >> +
> >> +static uint8_t default_rss_key[] = {
> >> +	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
> >> +	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
> >> +	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
> >> +	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
> >> +	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
> >> +};
> >> +
> >> +static void
> >> +run_thash_test(unsigned int tuple_len)
> >> +{
> >> +	uint32_t *tuples[BATCH_SZ];
> >> +	unsigned int i, j;
> >> +	uint64_t start_tsc, end_tsc;
> >> +	uint32_t len = RTE_ALIGN_CEIL(tuple_len, sizeof(uint32_t));
> >> +	volatile uint32_t hash = 0;
> >> +	uint32_t bulk_hash[BATCH_SZ] = { 0 };
> >> +
> >> +	for (i = 0; i < BATCH_SZ; i++) {
> >> +		tuples[i] = rte_zmalloc(NULL, len, 0);
> >> +		for (j = 0; j < len / sizeof(uint32_t); j++)
> >> +			tuples[i][j] = rte_rand();
> >> +	}
> >> +
> >> +	start_tsc = rte_rdtsc_precise();
> >> +	for (i = 0; i < ITERATIONS; i++) {
> >> +		for (j = 0; j < BATCH_SZ; j++) {
> >> +			hash ^= rte_softrss(tuples[j], len / sizeof(uint32_t),
> >> +				default_rss_key);
> >> +		}
> >> +	}
> >> +	end_tsc = rte_rdtsc_precise();
> >> +
> >> +	printf("Average rte_softrss() takes \t\t%.1f cycles for key len %d\n",
> >> +		(double)(end_tsc - start_tsc) / (double)(ITERATIONS *
> >> +		BATCH_SZ), len);
> >> +
> >> +	start_tsc = rte_rdtsc_precise();
> >> +	for (i = 0; i < ITERATIONS; i++) {
> >> +		for (j = 0; j < BATCH_SZ; j++) {
> >> +			hash ^= rte_softrss_be(tuples[j], len /
> >> +				sizeof(uint32_t), default_rss_key);
> >> +		}
> >> +	}
> >> +	end_tsc = rte_rdtsc_precise();
> >> +
> >> +	printf("Average rte_softrss_be() takes \t\t%.1f cycles for key len %d\n",
> >> +		(double)(end_tsc - start_tsc) / (double)(ITERATIONS *
> >> +		BATCH_SZ), len);
> > 
> > The function could stop here (one function per type of implementation).
> > 
> 
> Could you please clarify what do you mean?
> The function stops here if the machine do not support GFNI, and this is 
> done intentionally. On machine without GFNI it tests only scalar 
> implementations for every given length.

No, I mean you can split it into smaller functions.

> >> +
> >> +	if (!rte_thash_gfni_supported())
> >> +		return;
> >> +
> >> +	uint64_t rss_key_matrixes[RTE_DIM(default_rss_key)];
> >> +
> >> +	rte_thash_complete_matrix(rss_key_matrixes, default_rss_key,
> >> +		RTE_DIM(default_rss_key));
> >> +
> >> +	start_tsc = rte_rdtsc_precise();
> >> +	for (i = 0; i < ITERATIONS; i++) {
> >> +		for (j = 0; j < BATCH_SZ; j++)
> >> +			hash ^= rte_thash_gfni(rss_key_matrixes,
> >> +				(uint8_t *)tuples[j], len);
> >> +	}
> >> +	end_tsc = rte_rdtsc_precise();
> >> +
> >> +	printf("Average rte_thash_gfni takes \t\t%.1f cycles for key len %d\n",
> >> +		(double)(end_tsc - start_tsc) / (double)(ITERATIONS *
> >> +		BATCH_SZ), len);
> >> +
> >> +	start_tsc = rte_rdtsc_precise();
> >> +	for (i = 0; i < ITERATIONS; i++)
> >> +		rte_thash_gfni_bulk(rss_key_matrixes, len, (uint8_t **)tuples,
> >> +			bulk_hash, BATCH_SZ);
> >> +
> >> +	end_tsc = rte_rdtsc_precise();
> >> +
> >> +	printf("Average rte_thash_gfni_x2 takes \t%.1f cycles for key len %d\n",
> > 
> > and here, the function name is not updated.
> > 
> >> +		(double)(end_tsc - start_tsc) / (double)(ITERATIONS *
> >> +		BATCH_SZ), len);
> >> +
> > 
> > useless blank line
> > 
> >> +}
  
Vladimir Medvedkin Oct. 27, 2021, 3:48 p.m. UTC | #6
Hi Thomas,

On 27/10/2021 10:29, Thomas Monjalon wrote:
> 26/10/2021 22:29, Medvedkin, Vladimir:
>> Hi Thomas,
>>
>> Thanks for the review, I'll address your comments in v6.
>> Please find my comment below
>>
>> On 25/10/2021 19:02, Thomas Monjalon wrote:
>>> 21/10/2021 20:54, Vladimir Medvedkin:
>>>> This patch adds performance tests for different implementations
>>>> of the Toeplitz hash function.
>>>
>>> Please name them.
>>>
>>>> Signed-off-by: Vladimir Medvedkin <vladimir.medvedkin@intel.com>
>>>
>>> There are some garbage,
>>>
>>>> @@ -320,6 +321,7 @@ perf_test_names = [
>>>>            'hash_readwrite_lf_perf_autotest',
>>>>            'trace_perf_autotest',
>>>>            'ipsec_perf_autotest',
>>>> +	'thash_perf_autotest',
>>>
>>> here (tabs instead of space)
>>>
>>>>    driver_test_names = [
>>>> diff --git a/app/test/test_thash_perf.c b/app/test/test_thash_perf.c
>>>> new file mode 100644
>>>> index 0000000..fb66e20
>>>> --- /dev/null
>>>> +++ b/app/test/test_thash_perf.c
>>>> @@ -0,0 +1,120 @@
>>>> +/* SPDX-License-Identifier: BSD-3-Clause
>>>> + * Copyright(c) 2021 Intel Corporation
>>>> + */
>>>> +
>>>> +#include <stdio.h>
>>>> +#include <stdint.h>
>>>> +#include <stdlib.h>
>>>> +#include <math.h>
>>>> +
>>>> +#include <rte_cycles.h>
>>>> +#include <rte_malloc.h>
>>>> +#include <rte_random.h>
>>>> +#include <rte_thash.h>
>>>> +
>>>> +#include "test.h"
>>>> +
>>>> +#define ITERATIONS	(1 << 15)
>>>> +#define	BATCH_SZ	(1 << 10)
>>>> +
>>>> +#define IPV4_2_TUPLE_LEN	(8)
>>>> +#define IPV4_4_TUPLE_LEN	(12)
>>>> +#define IPV6_2_TUPLE_LEN	(32)
>>>> +#define IPV6_4_TUPLE_LEN	(36)
>>>> +
>>>> +
>>>> +static uint8_t default_rss_key[] = {
>>>> +	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
>>>> +	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
>>>> +	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
>>>> +	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
>>>> +	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
>>>> +};
>>>> +
>>>> +static void
>>>> +run_thash_test(unsigned int tuple_len)
>>>> +{
>>>> +	uint32_t *tuples[BATCH_SZ];
>>>> +	unsigned int i, j;
>>>> +	uint64_t start_tsc, end_tsc;
>>>> +	uint32_t len = RTE_ALIGN_CEIL(tuple_len, sizeof(uint32_t));
>>>> +	volatile uint32_t hash = 0;
>>>> +	uint32_t bulk_hash[BATCH_SZ] = { 0 };
>>>> +
>>>> +	for (i = 0; i < BATCH_SZ; i++) {
>>>> +		tuples[i] = rte_zmalloc(NULL, len, 0);
>>>> +		for (j = 0; j < len / sizeof(uint32_t); j++)
>>>> +			tuples[i][j] = rte_rand();
>>>> +	}
>>>> +
>>>> +	start_tsc = rte_rdtsc_precise();
>>>> +	for (i = 0; i < ITERATIONS; i++) {
>>>> +		for (j = 0; j < BATCH_SZ; j++) {
>>>> +			hash ^= rte_softrss(tuples[j], len / sizeof(uint32_t),
>>>> +				default_rss_key);
>>>> +		}
>>>> +	}
>>>> +	end_tsc = rte_rdtsc_precise();
>>>> +
>>>> +	printf("Average rte_softrss() takes \t\t%.1f cycles for key len %d\n",
>>>> +		(double)(end_tsc - start_tsc) / (double)(ITERATIONS *
>>>> +		BATCH_SZ), len);
>>>> +
>>>> +	start_tsc = rte_rdtsc_precise();
>>>> +	for (i = 0; i < ITERATIONS; i++) {
>>>> +		for (j = 0; j < BATCH_SZ; j++) {
>>>> +			hash ^= rte_softrss_be(tuples[j], len /
>>>> +				sizeof(uint32_t), default_rss_key);
>>>> +		}
>>>> +	}
>>>> +	end_tsc = rte_rdtsc_precise();
>>>> +
>>>> +	printf("Average rte_softrss_be() takes \t\t%.1f cycles for key len %d\n",
>>>> +		(double)(end_tsc - start_tsc) / (double)(ITERATIONS *
>>>> +		BATCH_SZ), len);
>>>
>>> The function could stop here (one function per type of implementation).
>>>
>>
>> Could you please clarify what do you mean?
>> The function stops here if the machine do not support GFNI, and this is
>> done intentionally. On machine without GFNI it tests only scalar
>> implementations for every given length.
> 
> No I mean you can split in smaller functions.
> 

Aha, I see, I'll send v7.

>>>> +
>>>> +	if (!rte_thash_gfni_supported())
>>>> +		return;
>>>> +
>>>> +	uint64_t rss_key_matrixes[RTE_DIM(default_rss_key)];
>>>> +
>>>> +	rte_thash_complete_matrix(rss_key_matrixes, default_rss_key,
>>>> +		RTE_DIM(default_rss_key));
>>>> +
>>>> +	start_tsc = rte_rdtsc_precise();
>>>> +	for (i = 0; i < ITERATIONS; i++) {
>>>> +		for (j = 0; j < BATCH_SZ; j++)
>>>> +			hash ^= rte_thash_gfni(rss_key_matrixes,
>>>> +				(uint8_t *)tuples[j], len);
>>>> +	}
>>>> +	end_tsc = rte_rdtsc_precise();
>>>> +
>>>> +	printf("Average rte_thash_gfni takes \t\t%.1f cycles for key len %d\n",
>>>> +		(double)(end_tsc - start_tsc) / (double)(ITERATIONS *
>>>> +		BATCH_SZ), len);
>>>> +
>>>> +	start_tsc = rte_rdtsc_precise();
>>>> +	for (i = 0; i < ITERATIONS; i++)
>>>> +		rte_thash_gfni_bulk(rss_key_matrixes, len, (uint8_t **)tuples,
>>>> +			bulk_hash, BATCH_SZ);
>>>> +
>>>> +	end_tsc = rte_rdtsc_precise();
>>>> +
>>>> +	printf("Average rte_thash_gfni_x2 takes \t%.1f cycles for key len %d\n",
>>>
>>> and here, the function name is not updated.
>>>
>>>> +		(double)(end_tsc - start_tsc) / (double)(ITERATIONS *
>>>> +		BATCH_SZ), len);
>>>> +
>>>
>>> useless blank line
>>>
>>>> +}
> 
> 
>
  

Patch

diff --git a/app/test/meson.build b/app/test/meson.build
index ba2600a..8b9e6e9 100644
--- a/app/test/meson.build
+++ b/app/test/meson.build
@@ -144,6 +144,7 @@  test_sources = files(
         'test_table_tables.c',
         'test_tailq.c',
         'test_thash.c',
+        'test_thash_perf.c',
         'test_timer.c',
         'test_timer_perf.c',
         'test_timer_racecond.c',
@@ -320,6 +321,7 @@  perf_test_names = [
         'hash_readwrite_lf_perf_autotest',
         'trace_perf_autotest',
         'ipsec_perf_autotest',
+	'thash_perf_autotest',
 ]
 
 driver_test_names = [
diff --git a/app/test/test_thash_perf.c b/app/test/test_thash_perf.c
new file mode 100644
index 0000000..fb66e20
--- /dev/null
+++ b/app/test/test_thash_perf.c
@@ -0,0 +1,120 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2021 Intel Corporation
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <math.h>
+
+#include <rte_cycles.h>
+#include <rte_malloc.h>
+#include <rte_random.h>
+#include <rte_thash.h>
+
+#include "test.h"
+
+#define ITERATIONS	(1 << 15)
+#define	BATCH_SZ	(1 << 10)
+
+#define IPV4_2_TUPLE_LEN	(8)
+#define IPV4_4_TUPLE_LEN	(12)
+#define IPV6_2_TUPLE_LEN	(32)
+#define IPV6_4_TUPLE_LEN	(36)
+
+
+static uint8_t default_rss_key[] = {
+	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
+	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
+	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
+	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
+	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
+};
+
+static void
+run_thash_test(unsigned int tuple_len)
+{
+	uint32_t *tuples[BATCH_SZ];
+	unsigned int i, j;
+	uint64_t start_tsc, end_tsc;
+	uint32_t len = RTE_ALIGN_CEIL(tuple_len, sizeof(uint32_t));
+	volatile uint32_t hash = 0;
+	uint32_t bulk_hash[BATCH_SZ] = { 0 };
+
+	for (i = 0; i < BATCH_SZ; i++) {
+		tuples[i] = rte_zmalloc(NULL, len, 0);
+		for (j = 0; j < len / sizeof(uint32_t); j++)
+			tuples[i][j] = rte_rand();
+	}
+
+	start_tsc = rte_rdtsc_precise();
+	for (i = 0; i < ITERATIONS; i++) {
+		for (j = 0; j < BATCH_SZ; j++) {
+			hash ^= rte_softrss(tuples[j], len / sizeof(uint32_t),
+				default_rss_key);
+		}
+	}
+	end_tsc = rte_rdtsc_precise();
+
+	printf("Average rte_softrss() takes \t\t%.1f cycles for key len %d\n",
+		(double)(end_tsc - start_tsc) / (double)(ITERATIONS *
+		BATCH_SZ), len);
+
+	start_tsc = rte_rdtsc_precise();
+	for (i = 0; i < ITERATIONS; i++) {
+		for (j = 0; j < BATCH_SZ; j++) {
+			hash ^= rte_softrss_be(tuples[j], len /
+				sizeof(uint32_t), default_rss_key);
+		}
+	}
+	end_tsc = rte_rdtsc_precise();
+
+	printf("Average rte_softrss_be() takes \t\t%.1f cycles for key len %d\n",
+		(double)(end_tsc - start_tsc) / (double)(ITERATIONS *
+		BATCH_SZ), len);
+
+	if (!rte_thash_gfni_supported())
+		return;
+
+	uint64_t rss_key_matrixes[RTE_DIM(default_rss_key)];
+
+	rte_thash_complete_matrix(rss_key_matrixes, default_rss_key,
+		RTE_DIM(default_rss_key));
+
+	start_tsc = rte_rdtsc_precise();
+	for (i = 0; i < ITERATIONS; i++) {
+		for (j = 0; j < BATCH_SZ; j++)
+			hash ^= rte_thash_gfni(rss_key_matrixes,
+				(uint8_t *)tuples[j], len);
+	}
+	end_tsc = rte_rdtsc_precise();
+
+	printf("Average rte_thash_gfni takes \t\t%.1f cycles for key len %d\n",
+		(double)(end_tsc - start_tsc) / (double)(ITERATIONS *
+		BATCH_SZ), len);
+
+	start_tsc = rte_rdtsc_precise();
+	for (i = 0; i < ITERATIONS; i++)
+		rte_thash_gfni_bulk(rss_key_matrixes, len, (uint8_t **)tuples,
+			bulk_hash, BATCH_SZ);
+
+	end_tsc = rte_rdtsc_precise();
+
+	printf("Average rte_thash_gfni_x2 takes \t%.1f cycles for key len %d\n",
+		(double)(end_tsc - start_tsc) / (double)(ITERATIONS *
+		BATCH_SZ), len);
+
+}
+
+static int
+test_thash_perf(void)
+{
+	run_thash_test(IPV4_2_TUPLE_LEN);
+	run_thash_test(IPV4_4_TUPLE_LEN);
+	run_thash_test(IPV6_2_TUPLE_LEN);
+	run_thash_test(IPV6_4_TUPLE_LEN);
+
+	return 0;
+}
+
+REGISTER_TEST_COMMAND(thash_perf_autotest, test_thash_perf);