[v2,1/2] app/test: add cksum performance test

Message ID 20220708125608.24532-1-mattias.ronnblom@ericsson.com (mailing list archive)
State Superseded, archived
Delegated to: Thomas Monjalon
Headers
Series [v2,1/2] app/test: add cksum performance test |

Checks

Context Check Description
ci/checkpatch success coding style OK

Commit Message

Mattias Rönnblom July 8, 2022, 12:56 p.m. UTC
  Add performance test for the rte_raw_cksum() function, which delegates
the actual work to __rte_raw_cksum(), which in turn is used by other
functions in need of Internet checksum calculation.

Signed-off-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>

---

v2:
  * Added __rte_unused to unused volatile variable, to keep the Intel
    compiler happy.
---
 MAINTAINERS                |   1 +
 app/test/meson.build       |   1 +
 app/test/test_cksum_perf.c | 118 +++++++++++++++++++++++++++++++++++++
 3 files changed, 120 insertions(+)
 create mode 100644 app/test/test_cksum_perf.c
  

Comments

Olivier Matz July 11, 2022, 9:47 a.m. UTC | #1
Hi Mattias,

Please see few comments below.

On Fri, Jul 08, 2022 at 02:56:07PM +0200, Mattias Rönnblom wrote:
> Add performance test for the rte_raw_cksum() function, which delegates
> the actual work to __rte_raw_cksum(), which in turn is used by other
> functions in need of Internet checksum calculation.
> 
> Signed-off-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
> 
> ---
> 
> v2:
>   * Added __rte_unused to unused volatile variable, to keep the Intel
>     compiler happy.
> ---
>  MAINTAINERS                |   1 +
>  app/test/meson.build       |   1 +
>  app/test/test_cksum_perf.c | 118 +++++++++++++++++++++++++++++++++++++
>  3 files changed, 120 insertions(+)
>  create mode 100644 app/test/test_cksum_perf.c
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index c923712946..2a4c99e05a 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -1414,6 +1414,7 @@ Network headers
>  M: Olivier Matz <olivier.matz@6wind.com>
>  F: lib/net/
>  F: app/test/test_cksum.c
> +F: app/test/test_cksum_perf.c
>  
>  Packet CRC
>  M: Jasvinder Singh <jasvinder.singh@intel.com>
> diff --git a/app/test/meson.build b/app/test/meson.build
> index 431c5bd318..191db03d1d 100644
> --- a/app/test/meson.build
> +++ b/app/test/meson.build
> @@ -18,6 +18,7 @@ test_sources = files(
>          'test_bpf.c',
>          'test_byteorder.c',
>          'test_cksum.c',
> +        'test_cksum_perf.c',
>          'test_cmdline.c',
>          'test_cmdline_cirbuf.c',
>          'test_cmdline_etheraddr.c',
> diff --git a/app/test/test_cksum_perf.c b/app/test/test_cksum_perf.c
> new file mode 100644
> index 0000000000..bff73cb3bb
> --- /dev/null
> +++ b/app/test/test_cksum_perf.c
> @@ -0,0 +1,118 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2022 Ericsson AB
> + */
> +
> +#include <stdio.h>
> +
> +#include <rte_common.h>
> +#include <rte_cycles.h>
> +#include <rte_ip.h>
> +#include <rte_malloc.h>
> +#include <rte_random.h>
> +
> +#include "test.h"
> +
> +#define NUM_BLOCKS (10)
> +#define ITERATIONS (1000000)

Parenthesis can be safely removed

> +
> +static const size_t data_sizes[] = { 20, 21, 100, 101, 1500, 1501 };
> +
> +static __rte_noinline uint16_t
> +do_rte_raw_cksum(const void *buf, size_t len)
> +{
> +	return rte_raw_cksum(buf, len);
> +}

I don't understand the need to have this wrapper, especially marked
__rte_noinline. What is the objective?

Note that when I remove the __rte_noinline, the performance is better
for size 20 and 21.

> +
> +static void
> +init_block(void *buf, size_t len)

Can buf be a (char *) instead?
It would avoid a cast below.

> +{
> +	size_t i;
> +
> +	for (i = 0; i < len; i++)
> +		((char *)buf)[i] = (uint8_t)rte_rand();
> +}
> +
> +static int
> +test_cksum_perf_size_alignment(size_t block_size, bool aligned)
> +{
> +	char *data[NUM_BLOCKS];
> +	char *blocks[NUM_BLOCKS];
> +	unsigned int i;
> +	uint64_t start;
> +	uint64_t end;
> +	/* Floating point to handle low (pseudo-)TSC frequencies */
> +	double block_latency;
> +	double byte_latency;
> +	volatile __rte_unused uint64_t sum = 0;
> +
> +	for (i = 0; i < NUM_BLOCKS; i++) {
> +		data[i] = rte_malloc(NULL, block_size + 1, 0);
> +
> +		if (data[i] == NULL) {
> +			printf("Failed to allocate memory for block\n");
> +			return TEST_FAILED;
> +		}
> +
> +		init_block(data[i], block_size + 1);
> +
> +		blocks[i] = aligned ? data[i] : data[i] + 1;
> +	}
> +
> +	start = rte_rdtsc();
> +
> +	for (i = 0; i < ITERATIONS; i++) {
> +		unsigned int j;
> +		for (j = 0; j < NUM_BLOCKS; j++)
> +			sum += do_rte_raw_cksum(blocks[j], block_size);
> +	}
> +
> +	end = rte_rdtsc();
> +
> +	block_latency = (end - start) / (double)(ITERATIONS * NUM_BLOCKS);
> +	byte_latency = block_latency / block_size;
> +
> +	printf("%-9s %10zd %19.1f %16.2f\n", aligned ? "Aligned" : "Unaligned",
> +	       block_size, block_latency, byte_latency);

When I run the test on my dev machine, I get the following results,
which are quite reproductible:

Aligned           20       10.4      0.52     (range is 0.48 - 0.52)
Unaligned         20        7.9      0.39     (range is 0.39 - 0.40)
...

If I increase the number of iterations, the first results
change significantly:

Aligned           20        8.2      0.42     (range is 0.41 - 0.42)
Unaligned         20        8.0      0.40     (always this value)

To have more precise tests with small size, would it make sense to
target a test time instead of an iteration count? Something like
this:

	#define ITERATIONS 1000000
	uint64_t iterations = 0;

	...

	do {
		for (i = 0; i < ITERATIONS; i++) {
			unsigned int j;
			for (j = 0; j < NUM_BLOCKS; j++)
				sum += do_rte_raw_cksum(blocks[j], block_size);
		}
		iterations += ITERATIONS;
		end = rte_rdtsc();
	} while ((end - start) < rte_get_tsc_hz());

	block_latency = (end - start) / (double)(iterations * NUM_BLOCKS);


After this change, the aligned and unaligned cases have the same
performance on my machine.


> +
> +	for (i = 0; i < NUM_BLOCKS; i++)
> +		rte_free(data[i]);
> +
> +	return TEST_SUCCESS;
> +}
> +
> +static int
> +test_cksum_perf_size(size_t block_size)
> +{
> +	int rc;
> +
> +	rc = test_cksum_perf_size_alignment(block_size, true);
> +	if (rc != TEST_SUCCESS)
> +		return rc;
> +
> +	rc = test_cksum_perf_size_alignment(block_size, false);
> +
> +	return rc;
> +}
> +
> +static int
> +test_cksum_perf(void)
> +{
> +	uint16_t i;
> +
> +	printf("### rte_raw_cksum() performance ###\n");
> +	printf("Alignment  Block size    TSC cycles/block  TSC cycles/byte\n");
> +
> +	for (i = 0; i < RTE_DIM(data_sizes); i++) {
> +		int rc;
> +
> +		rc = test_cksum_perf_size(data_sizes[i]);
> +		if (rc != TEST_SUCCESS)
> +			return rc;
> +	}
> +
> +	return TEST_SUCCESS;
> +}
> +
> +
> +REGISTER_TEST_COMMAND(cksum_perf_autotest, test_cksum_perf);
> +

The last empty line can be removed.

> -- 
> 2.25.1
>
  
Mattias Rönnblom July 11, 2022, 10:42 a.m. UTC | #2
On 2022-07-11 11:47, Olivier Matz wrote:
> Hi Mattias,
> 
> Please see few comments below.
> 
> On Fri, Jul 08, 2022 at 02:56:07PM +0200, Mattias Rönnblom wrote:
>> Add performance test for the rte_raw_cksum() function, which delegates
>> the actual work to __rte_raw_cksum(), which in turn is used by other
>> functions in need of Internet checksum calculation.
>>
>> Signed-off-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
>>
>> ---
>>
>> v2:
>>    * Added __rte_unused to unused volatile variable, to keep the Intel
>>      compiler happy.
>> ---
>>   MAINTAINERS                |   1 +
>>   app/test/meson.build       |   1 +
>>   app/test/test_cksum_perf.c | 118 +++++++++++++++++++++++++++++++++++++
>>   3 files changed, 120 insertions(+)
>>   create mode 100644 app/test/test_cksum_perf.c
>>
>> diff --git a/MAINTAINERS b/MAINTAINERS
>> index c923712946..2a4c99e05a 100644
>> --- a/MAINTAINERS
>> +++ b/MAINTAINERS
>> @@ -1414,6 +1414,7 @@ Network headers
>>   M: Olivier Matz <olivier.matz@6wind.com>
>>   F: lib/net/
>>   F: app/test/test_cksum.c
>> +F: app/test/test_cksum_perf.c
>>   
>>   Packet CRC
>>   M: Jasvinder Singh <jasvinder.singh@intel.com>
>> diff --git a/app/test/meson.build b/app/test/meson.build
>> index 431c5bd318..191db03d1d 100644
>> --- a/app/test/meson.build
>> +++ b/app/test/meson.build
>> @@ -18,6 +18,7 @@ test_sources = files(
>>           'test_bpf.c',
>>           'test_byteorder.c',
>>           'test_cksum.c',
>> +        'test_cksum_perf.c',
>>           'test_cmdline.c',
>>           'test_cmdline_cirbuf.c',
>>           'test_cmdline_etheraddr.c',
>> diff --git a/app/test/test_cksum_perf.c b/app/test/test_cksum_perf.c
>> new file mode 100644
>> index 0000000000..bff73cb3bb
>> --- /dev/null
>> +++ b/app/test/test_cksum_perf.c
>> @@ -0,0 +1,118 @@
>> +/* SPDX-License-Identifier: BSD-3-Clause
>> + * Copyright(c) 2022 Ericsson AB
>> + */
>> +
>> +#include <stdio.h>
>> +
>> +#include <rte_common.h>
>> +#include <rte_cycles.h>
>> +#include <rte_ip.h>
>> +#include <rte_malloc.h>
>> +#include <rte_random.h>
>> +
>> +#include "test.h"
>> +
>> +#define NUM_BLOCKS (10)
>> +#define ITERATIONS (1000000)
> 
> Parenthesis can be safely removed
> 
>> +
>> +static const size_t data_sizes[] = { 20, 21, 100, 101, 1500, 1501 };
>> +
>> +static __rte_noinline uint16_t
>> +do_rte_raw_cksum(const void *buf, size_t len)
>> +{
>> +	return rte_raw_cksum(buf, len);
>> +}
> 
> I don't understand the need to have this wrapper, especially marked
> __rte_noinline. What is the objective?
> 

The intention is to disallow the compiler to perform unrolling and 
integrating/interleave one cksum operating with the next buffer's in a 
way that wouldn't be feasable in a real application.

It will result in an overestimation of the cost for small cksums, so 
it's still misleading, but in another direction. :)

> Note that when I remove the __rte_noinline, the performance is better
> for size 20 and 21.
> 
>> +
>> +static void
>> +init_block(void *buf, size_t len)
> 
> Can buf be a (char *) instead?
> It would avoid a cast below.
> 

Yes.

>> +{
>> +	size_t i;
>> +
>> +	for (i = 0; i < len; i++)
>> +		((char *)buf)[i] = (uint8_t)rte_rand();
>> +}
>> +
>> +static int
>> +test_cksum_perf_size_alignment(size_t block_size, bool aligned)
>> +{
>> +	char *data[NUM_BLOCKS];
>> +	char *blocks[NUM_BLOCKS];
>> +	unsigned int i;
>> +	uint64_t start;
>> +	uint64_t end;
>> +	/* Floating point to handle low (pseudo-)TSC frequencies */
>> +	double block_latency;
>> +	double byte_latency;
>> +	volatile __rte_unused uint64_t sum = 0;
>> +
>> +	for (i = 0; i < NUM_BLOCKS; i++) {
>> +		data[i] = rte_malloc(NULL, block_size + 1, 0);
>> +
>> +		if (data[i] == NULL) {
>> +			printf("Failed to allocate memory for block\n");
>> +			return TEST_FAILED;
>> +		}
>> +
>> +		init_block(data[i], block_size + 1);
>> +
>> +		blocks[i] = aligned ? data[i] : data[i] + 1;
>> +	}
>> +
>> +	start = rte_rdtsc();
>> +
>> +	for (i = 0; i < ITERATIONS; i++) {
>> +		unsigned int j;
>> +		for (j = 0; j < NUM_BLOCKS; j++)
>> +			sum += do_rte_raw_cksum(blocks[j], block_size);
>> +	}
>> +
>> +	end = rte_rdtsc();
>> +
>> +	block_latency = (end - start) / (double)(ITERATIONS * NUM_BLOCKS);
>> +	byte_latency = block_latency / block_size;
>> +
>> +	printf("%-9s %10zd %19.1f %16.2f\n", aligned ? "Aligned" : "Unaligned",
>> +	       block_size, block_latency, byte_latency);
> 
> When I run the test on my dev machine, I get the following results,
> which are quite reproductible:
> 
> Aligned           20       10.4      0.52     (range is 0.48 - 0.52)
> Unaligned         20        7.9      0.39     (range is 0.39 - 0.40)
> ...
> 
> If I increase the number of iterations, the first results
> change significantly:
> 
> Aligned           20        8.2      0.42     (range is 0.41 - 0.42)
> Unaligned         20        8.0      0.40     (always this value)


I suspect you have frequency scaling enabled on your system. This is 
generally not advisable, you want to some level of determinism in when 
benchmarking. Especially on short runs like this is (and must be).

I thought about doing something about this, but it seemed like an issue 
that should be addressed on a framework level, rather than on a per-perf 
autotest level.

If you want your CPU core to scale up, you can just insert

rte_delay_block_us(100000);

before the actual test is run.

Should I add this? I *think* 100 ms should be enough, but maybe someone 
with more in-depth knowledge of the frequency governors can comment on this.

> 
> To have more precise tests with small size, would it make sense to
> target a test time instead of an iteration count? Something like
> this:
> 

The time lost when running on a lower frequency (plus the hiccups when 
the frequency is changed) will be amortized as you add to the length of 
the test run, which will partly solved the problem. A better solution is 
to not start the test before the core runs on the max frequency.

Again, this is assuming DVFS is what you suffer from here. I guess in 
theory it could be TLB miss as well.

> 	#define ITERATIONS 1000000
> 	uint64_t iterations = 0;
> 
> 	...
> 
> 	do {
> 		for (i = 0; i < ITERATIONS; i++) {
> 			unsigned int j;
> 			for (j = 0; j < NUM_BLOCKS; j++)
> 				sum += do_rte_raw_cksum(blocks[j], block_size);
> 		}
> 		iterations += ITERATIONS;
> 		end = rte_rdtsc();
> 	} while ((end - start) < rte_get_tsc_hz());
> 
> 	block_latency = (end - start) / (double)(iterations * NUM_BLOCKS);
> 
> 
> After this change, the aligned and unaligned cases have the same
> performance on my machine.
> 
> 

RTE>>cksum_perf_autotest
### rte_raw_cksum() performance ###
Alignment  Block size    TSC cycles/block  TSC cycles/byte
Aligned           20                16.1             0.81
Unaligned         20                16.1             0.81

... with the 100 ms busy-wait delay (and frequency scaling enabled) on 
my AMD machine.

>> +
>> +	for (i = 0; i < NUM_BLOCKS; i++)
>> +		rte_free(data[i]);
>> +
>> +	return TEST_SUCCESS;
>> +}
>> +
>> +static int
>> +test_cksum_perf_size(size_t block_size)
>> +{
>> +	int rc;
>> +
>> +	rc = test_cksum_perf_size_alignment(block_size, true);
>> +	if (rc != TEST_SUCCESS)
>> +		return rc;
>> +
>> +	rc = test_cksum_perf_size_alignment(block_size, false);
>> +
>> +	return rc;
>> +}
>> +
>> +static int
>> +test_cksum_perf(void)
>> +{
>> +	uint16_t i;
>> +
>> +	printf("### rte_raw_cksum() performance ###\n");
>> +	printf("Alignment  Block size    TSC cycles/block  TSC cycles/byte\n");
>> +
>> +	for (i = 0; i < RTE_DIM(data_sizes); i++) {
>> +		int rc;
>> +
>> +		rc = test_cksum_perf_size(data_sizes[i]);
>> +		if (rc != TEST_SUCCESS)
>> +			return rc;
>> +	}
>> +
>> +	return TEST_SUCCESS;
>> +}
>> +
>> +
>> +REGISTER_TEST_COMMAND(cksum_perf_autotest, test_cksum_perf);
>> +
> 
> The last empty line can be removed.
> 

OK.

Thanks for the review. I will send a v3 as soon as we've settled the 
DVFS issue.

>> -- 
>> 2.25.1
>>
  
Olivier Matz July 11, 2022, 11:33 a.m. UTC | #3
On Mon, Jul 11, 2022 at 10:42:37AM +0000, Mattias Rönnblom wrote:
> On 2022-07-11 11:47, Olivier Matz wrote:
> > Hi Mattias,
> > 
> > Please see few comments below.
> > 
> > On Fri, Jul 08, 2022 at 02:56:07PM +0200, Mattias Rönnblom wrote:
> >> Add performance test for the rte_raw_cksum() function, which delegates
> >> the actual work to __rte_raw_cksum(), which in turn is used by other
> >> functions in need of Internet checksum calculation.
> >>
> >> Signed-off-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
> >>
> >> ---
> >>
> >> v2:
> >>    * Added __rte_unused to unused volatile variable, to keep the Intel
> >>      compiler happy.
> >> ---
> >>   MAINTAINERS                |   1 +
> >>   app/test/meson.build       |   1 +
> >>   app/test/test_cksum_perf.c | 118 +++++++++++++++++++++++++++++++++++++
> >>   3 files changed, 120 insertions(+)
> >>   create mode 100644 app/test/test_cksum_perf.c
> >>
> >> diff --git a/MAINTAINERS b/MAINTAINERS
> >> index c923712946..2a4c99e05a 100644
> >> --- a/MAINTAINERS
> >> +++ b/MAINTAINERS
> >> @@ -1414,6 +1414,7 @@ Network headers
> >>   M: Olivier Matz <olivier.matz@6wind.com>
> >>   F: lib/net/
> >>   F: app/test/test_cksum.c
> >> +F: app/test/test_cksum_perf.c
> >>   
> >>   Packet CRC
> >>   M: Jasvinder Singh <jasvinder.singh@intel.com>
> >> diff --git a/app/test/meson.build b/app/test/meson.build
> >> index 431c5bd318..191db03d1d 100644
> >> --- a/app/test/meson.build
> >> +++ b/app/test/meson.build
> >> @@ -18,6 +18,7 @@ test_sources = files(
> >>           'test_bpf.c',
> >>           'test_byteorder.c',
> >>           'test_cksum.c',
> >> +        'test_cksum_perf.c',
> >>           'test_cmdline.c',
> >>           'test_cmdline_cirbuf.c',
> >>           'test_cmdline_etheraddr.c',
> >> diff --git a/app/test/test_cksum_perf.c b/app/test/test_cksum_perf.c
> >> new file mode 100644
> >> index 0000000000..bff73cb3bb
> >> --- /dev/null
> >> +++ b/app/test/test_cksum_perf.c
> >> @@ -0,0 +1,118 @@
> >> +/* SPDX-License-Identifier: BSD-3-Clause
> >> + * Copyright(c) 2022 Ericsson AB
> >> + */
> >> +
> >> +#include <stdio.h>
> >> +
> >> +#include <rte_common.h>
> >> +#include <rte_cycles.h>
> >> +#include <rte_ip.h>
> >> +#include <rte_malloc.h>
> >> +#include <rte_random.h>
> >> +
> >> +#include "test.h"
> >> +
> >> +#define NUM_BLOCKS (10)
> >> +#define ITERATIONS (1000000)
> > 
> > Parenthesis can be safely removed
> > 
> >> +
> >> +static const size_t data_sizes[] = { 20, 21, 100, 101, 1500, 1501 };
> >> +
> >> +static __rte_noinline uint16_t
> >> +do_rte_raw_cksum(const void *buf, size_t len)
> >> +{
> >> +	return rte_raw_cksum(buf, len);
> >> +}
> > 
> > I don't understand the need to have this wrapper, especially marked
> > __rte_noinline. What is the objective?
> > 
> 
> The intention is to disallow the compiler to perform unrolling and 
> integrating/interleave one cksum operating with the next buffer's in a 
> way that wouldn't be feasable in a real application.
> 
> It will result in an overestimation of the cost for small cksums, so 
> it's still misleading, but in another direction. :)

OK, got it. I think it's fine like you did then.

> 
> > Note that when I remove the __rte_noinline, the performance is better
> > for size 20 and 21.
> > 
> >> +
> >> +static void
> >> +init_block(void *buf, size_t len)
> > 
> > Can buf be a (char *) instead?
> > It would avoid a cast below.
> > 
> 
> Yes.
> 
> >> +{
> >> +	size_t i;
> >> +
> >> +	for (i = 0; i < len; i++)
> >> +		((char *)buf)[i] = (uint8_t)rte_rand();
> >> +}
> >> +
> >> +static int
> >> +test_cksum_perf_size_alignment(size_t block_size, bool aligned)
> >> +{
> >> +	char *data[NUM_BLOCKS];
> >> +	char *blocks[NUM_BLOCKS];
> >> +	unsigned int i;
> >> +	uint64_t start;
> >> +	uint64_t end;
> >> +	/* Floating point to handle low (pseudo-)TSC frequencies */
> >> +	double block_latency;
> >> +	double byte_latency;
> >> +	volatile __rte_unused uint64_t sum = 0;
> >> +
> >> +	for (i = 0; i < NUM_BLOCKS; i++) {
> >> +		data[i] = rte_malloc(NULL, block_size + 1, 0);
> >> +
> >> +		if (data[i] == NULL) {
> >> +			printf("Failed to allocate memory for block\n");
> >> +			return TEST_FAILED;
> >> +		}
> >> +
> >> +		init_block(data[i], block_size + 1);
> >> +
> >> +		blocks[i] = aligned ? data[i] : data[i] + 1;
> >> +	}
> >> +
> >> +	start = rte_rdtsc();
> >> +
> >> +	for (i = 0; i < ITERATIONS; i++) {
> >> +		unsigned int j;
> >> +		for (j = 0; j < NUM_BLOCKS; j++)
> >> +			sum += do_rte_raw_cksum(blocks[j], block_size);
> >> +	}
> >> +
> >> +	end = rte_rdtsc();
> >> +
> >> +	block_latency = (end - start) / (double)(ITERATIONS * NUM_BLOCKS);
> >> +	byte_latency = block_latency / block_size;
> >> +
> >> +	printf("%-9s %10zd %19.1f %16.2f\n", aligned ? "Aligned" : "Unaligned",
> >> +	       block_size, block_latency, byte_latency);
> > 
> > When I run the test on my dev machine, I get the following results,
> > which are quite reproductible:
> > 
> > Aligned           20       10.4      0.52     (range is 0.48 - 0.52)
> > Unaligned         20        7.9      0.39     (range is 0.39 - 0.40)
> > ...
> > 
> > If I increase the number of iterations, the first results
> > change significantly:
> > 
> > Aligned           20        8.2      0.42     (range is 0.41 - 0.42)
> > Unaligned         20        8.0      0.40     (always this value)
> 
> 
> I suspect you have frequency scaling enabled on your system. This is 
> generally not advisable, you want to some level of determinism in when 
> benchmarking. Especially on short runs like this is (and must be).
> 
> I thought about doing something about this, but it seemed like an issue 
> that should be addressed on a framework level, rather than on a per-perf 
> autotest level.
> 
> If you want your CPU core to scale up, you can just insert
> 
> rte_delay_block_us(100000);
> 
> before the actual test is run.

Your hypothesis is correct. When I disable frequency scaling, the results
are now the same with 100K, 1M or 10M iterations.

However, adding a rte_delay_us_block() does not really solve the issue,
probably because it calls rte_pause() in the loop.

> Should I add this? I *think* 100 ms should be enough, but maybe someone 
> with more in-depth knowledge of the frequency governors can comment on this.

Anyway, I think we don't need to add the blocking delay, we can
legitimally expect that freq scaling is disabled when we run performance
tests.

> 
> > 
> > To have more precise tests with small size, would it make sense to
> > target a test time instead of an iteration count? Something like
> > this:
> > 
> 
> The time lost when running on a lower frequency (plus the hiccups when 
> the frequency is changed) will be amortized as you add to the length of 
> the test run, which will partly solved the problem. A better solution is 
> to not start the test before the core runs on the max frequency.
> 
> Again, this is assuming DVFS is what you suffer from here. I guess in 
> theory it could be TLB miss as well.
> 
> > 	#define ITERATIONS 1000000
> > 	uint64_t iterations = 0;
> > 
> > 	...
> > 
> > 	do {
> > 		for (i = 0; i < ITERATIONS; i++) {
> > 			unsigned int j;
> > 			for (j = 0; j < NUM_BLOCKS; j++)
> > 				sum += do_rte_raw_cksum(blocks[j], block_size);
> > 		}
> > 		iterations += ITERATIONS;
> > 		end = rte_rdtsc();
> > 	} while ((end - start) < rte_get_tsc_hz());
> > 
> > 	block_latency = (end - start) / (double)(iterations * NUM_BLOCKS);
> > 
> > 
> > After this change, the aligned and unaligned cases have the same
> > performance on my machine.
> > 
> > 
> 
> RTE>>cksum_perf_autotest
> ### rte_raw_cksum() performance ###
> Alignment  Block size    TSC cycles/block  TSC cycles/byte
> Aligned           20                16.1             0.81
> Unaligned         20                16.1             0.81
> 
> ... with the 100 ms busy-wait delay (and frequency scaling enabled) on 
> my AMD machine.
> 
> >> +
> >> +	for (i = 0; i < NUM_BLOCKS; i++)
> >> +		rte_free(data[i]);
> >> +
> >> +	return TEST_SUCCESS;
> >> +}
> >> +
> >> +static int
> >> +test_cksum_perf_size(size_t block_size)
> >> +{
> >> +	int rc;
> >> +
> >> +	rc = test_cksum_perf_size_alignment(block_size, true);
> >> +	if (rc != TEST_SUCCESS)
> >> +		return rc;
> >> +
> >> +	rc = test_cksum_perf_size_alignment(block_size, false);
> >> +
> >> +	return rc;
> >> +}
> >> +
> >> +static int
> >> +test_cksum_perf(void)
> >> +{
> >> +	uint16_t i;
> >> +
> >> +	printf("### rte_raw_cksum() performance ###\n");
> >> +	printf("Alignment  Block size    TSC cycles/block  TSC cycles/byte\n");
> >> +
> >> +	for (i = 0; i < RTE_DIM(data_sizes); i++) {
> >> +		int rc;
> >> +
> >> +		rc = test_cksum_perf_size(data_sizes[i]);
> >> +		if (rc != TEST_SUCCESS)
> >> +			return rc;
> >> +	}
> >> +
> >> +	return TEST_SUCCESS;
> >> +}
> >> +
> >> +
> >> +REGISTER_TEST_COMMAND(cksum_perf_autotest, test_cksum_perf);
> >> +
> > 
> > The last empty line can be removed.
> > 
> 
> OK.
> 
> Thanks for the review. I will send a v3 as soon as we've settled the 
> DVFS issue.
> 
> >> -- 
> >> 2.25.1
> >>
>
  

Patch

diff --git a/MAINTAINERS b/MAINTAINERS
index c923712946..2a4c99e05a 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1414,6 +1414,7 @@  Network headers
 M: Olivier Matz <olivier.matz@6wind.com>
 F: lib/net/
 F: app/test/test_cksum.c
+F: app/test/test_cksum_perf.c
 
 Packet CRC
 M: Jasvinder Singh <jasvinder.singh@intel.com>
diff --git a/app/test/meson.build b/app/test/meson.build
index 431c5bd318..191db03d1d 100644
--- a/app/test/meson.build
+++ b/app/test/meson.build
@@ -18,6 +18,7 @@  test_sources = files(
         'test_bpf.c',
         'test_byteorder.c',
         'test_cksum.c',
+        'test_cksum_perf.c',
         'test_cmdline.c',
         'test_cmdline_cirbuf.c',
         'test_cmdline_etheraddr.c',
diff --git a/app/test/test_cksum_perf.c b/app/test/test_cksum_perf.c
new file mode 100644
index 0000000000..bff73cb3bb
--- /dev/null
+++ b/app/test/test_cksum_perf.c
@@ -0,0 +1,118 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2022 Ericsson AB
+ */
+
+#include <stdio.h>
+
+#include <rte_common.h>
+#include <rte_cycles.h>
+#include <rte_ip.h>
+#include <rte_malloc.h>
+#include <rte_random.h>
+
+#include "test.h"
+
+#define NUM_BLOCKS (10)
+#define ITERATIONS (1000000)
+
+static const size_t data_sizes[] = { 20, 21, 100, 101, 1500, 1501 };
+
+static __rte_noinline uint16_t
+do_rte_raw_cksum(const void *buf, size_t len)
+{
+	return rte_raw_cksum(buf, len);
+}
+
+static void
+init_block(void *buf, size_t len)
+{
+	size_t i;
+
+	for (i = 0; i < len; i++)
+		((char *)buf)[i] = (uint8_t)rte_rand();
+}
+
+static int
+test_cksum_perf_size_alignment(size_t block_size, bool aligned)
+{
+	char *data[NUM_BLOCKS];
+	char *blocks[NUM_BLOCKS];
+	unsigned int i;
+	uint64_t start;
+	uint64_t end;
+	/* Floating point to handle low (pseudo-)TSC frequencies */
+	double block_latency;
+	double byte_latency;
+	volatile __rte_unused uint64_t sum = 0;
+
+	for (i = 0; i < NUM_BLOCKS; i++) {
+		data[i] = rte_malloc(NULL, block_size + 1, 0);
+
+		if (data[i] == NULL) {
+			printf("Failed to allocate memory for block\n");
+			return TEST_FAILED;
+		}
+
+		init_block(data[i], block_size + 1);
+
+		blocks[i] = aligned ? data[i] : data[i] + 1;
+	}
+
+	start = rte_rdtsc();
+
+	for (i = 0; i < ITERATIONS; i++) {
+		unsigned int j;
+		for (j = 0; j < NUM_BLOCKS; j++)
+			sum += do_rte_raw_cksum(blocks[j], block_size);
+	}
+
+	end = rte_rdtsc();
+
+	block_latency = (end - start) / (double)(ITERATIONS * NUM_BLOCKS);
+	byte_latency = block_latency / block_size;
+
+	printf("%-9s %10zd %19.1f %16.2f\n", aligned ? "Aligned" : "Unaligned",
+	       block_size, block_latency, byte_latency);
+
+	for (i = 0; i < NUM_BLOCKS; i++)
+		rte_free(data[i]);
+
+	return TEST_SUCCESS;
+}
+
+static int
+test_cksum_perf_size(size_t block_size)
+{
+	int rc;
+
+	rc = test_cksum_perf_size_alignment(block_size, true);
+	if (rc != TEST_SUCCESS)
+		return rc;
+
+	rc = test_cksum_perf_size_alignment(block_size, false);
+
+	return rc;
+}
+
+static int
+test_cksum_perf(void)
+{
+	uint16_t i;
+
+	printf("### rte_raw_cksum() performance ###\n");
+	printf("Alignment  Block size    TSC cycles/block  TSC cycles/byte\n");
+
+	for (i = 0; i < RTE_DIM(data_sizes); i++) {
+		int rc;
+
+		rc = test_cksum_perf_size(data_sizes[i]);
+		if (rc != TEST_SUCCESS)
+			return rc;
+	}
+
+	return TEST_SUCCESS;
+}
+
+
+REGISTER_TEST_COMMAND(cksum_perf_autotest, test_cksum_perf);
+