[v2,3/3] app/testpmd: qualify profiling statistics on burst size

Message ID 1584625851-10291-4-git-send-email-viacheslavo@mellanox.com (mailing list archive)
State Changes Requested, archived
Delegated to: Ferruh Yigit
Headers
Series app/testpmd: qualify Rx/Tx profiling data on burst size |

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/travis-robot success Travis build: passed
ci/Intel-compilation success Compilation OK

Commit Message

Slava Ovsiienko March 19, 2020, 1:50 p.m. UTC
  The execution time of rx/tx burst routine depends on the burst size.
It would be meaningful to research this dependency, the patch
provides an extra profiling data per rx/tx burst size.

Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
---
 app/test-pmd/csumonly.c   | 11 +++++++----
 app/test-pmd/flowgen.c    | 11 +++++++----
 app/test-pmd/icmpecho.c   | 12 ++++++++----
 app/test-pmd/iofwd.c      | 11 +++++++----
 app/test-pmd/macfwd.c     | 11 +++++++----
 app/test-pmd/macswap.c    | 11 +++++++----
 app/test-pmd/rxonly.c     |  2 +-
 app/test-pmd/softnicfwd.c | 11 +++++++----
 app/test-pmd/testpmd.c    | 31 ++++++++++++++++++++++++-------
 app/test-pmd/testpmd.h    | 26 ++++++++++++++++++++++----
 app/test-pmd/txonly.c     |  9 ++++++---
 11 files changed, 103 insertions(+), 43 deletions(-)
  

Comments

Jerin Jacob March 20, 2020, 6:13 a.m. UTC | #1
On Thu, Mar 19, 2020 at 7:21 PM Viacheslav Ovsiienko
<viacheslavo@mellanox.com> wrote:
>
> The execution time of rx/tx burst routine depends on the burst size.
> It would be meaningful to research this dependency, the patch
> provides an extra profiling data per rx/tx burst size.
>
> Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>

> +#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
> +       if (!(fwdprof_flags & (nrx_tx ? RECORD_CORE_CYCLES_TX
> +                                     : RECORD_CORE_CYCLES_RX)))
> +               return;
> +       for (nb_pkt = 0; nb_pkt < MAX_PKT_BURST; nb_pkt++) {
> +               nb_burst = nrx_tx ? pbs->pkt_retry_spread[nb_pkt]
> +                                 : pbs->pkt_burst_spread[nb_pkt];
> +               if (nb_burst == 0)
> +                       continue;
> +               printf("  CPU cycles/%u packet burst=%u (total cycles="
> +                      "%"PRIu64" / total %s bursts=%u)\n",
> +                      (unsigned int)nb_pkt,
> +                      (unsigned int)(pbs->pkt_ticks_spread[nb_pkt] / nb_burst),
> +                      pbs->pkt_ticks_spread[nb_pkt],
> +                      nrx_tx ? "TX" : "RX", nb_burst);
> +       }
> +#endif


# Thanks for this feature, IMO, It worth to mention in release notes.

# I see a lot of code get added under RTE_TEST_PMD_RECORD_CORE_CYCLES.
I agree that it should be under RTE_TEST_PMD_RECORD_CORE_CYCLES. Since
this flag is not
by default, there is a risk of compilation issue when this flag is get enabled.
IMO, it is better to move to _if (0)_ semantics to enable
- performance when compiler opt-out the code.
- It forces the compiler to check the compilation errors irrespective
of the RTE_TEST_PMD_RECORD_CORE_CYCLES state.

Something like below,

static __rte_always_inline int
testpmd_has_stats_feature(void)
{
#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
        return RTE_TEST_PMD_RECORD_CORE_CYCLES ;
#else
        return 0;
#endif
}


if (testpmd_has_stats_feature()) {

}
  
Andrzej Ostruszka March 20, 2020, 4:03 p.m. UTC | #2
Viacheslav, thanks for the patches.  I'll comment here on the whole
patch set.  This patch set has many similar changes so I'll focus on the
first (csumonly) and the comment will apply to all remaining - OK?

On 3/19/20 2:50 PM, Viacheslav Ovsiienko wrote:
> The execution time of rx/tx burst routine depends on the burst size.
> It would be meaningful to research this dependency, the patch
> provides an extra profiling data per rx/tx burst size.
> 
> Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
> ---
>  app/test-pmd/csumonly.c   | 11 +++++++----
>  app/test-pmd/flowgen.c    | 11 +++++++----
>  app/test-pmd/icmpecho.c   | 12 ++++++++----
>  app/test-pmd/iofwd.c      | 11 +++++++----
>  app/test-pmd/macfwd.c     | 11 +++++++----
>  app/test-pmd/macswap.c    | 11 +++++++----
>  app/test-pmd/rxonly.c     |  2 +-
>  app/test-pmd/softnicfwd.c | 11 +++++++----
>  app/test-pmd/testpmd.c    | 31 ++++++++++++++++++++++++-------
>  app/test-pmd/testpmd.h    | 26 ++++++++++++++++++++++----
>  app/test-pmd/txonly.c     |  9 ++++++---
>  11 files changed, 103 insertions(+), 43 deletions(-)
> 
> diff --git a/app/test-pmd/csumonly.c b/app/test-pmd/csumonly.c
> index 4104737..c966892 100644
> --- a/app/test-pmd/csumonly.c
> +++ b/app/test-pmd/csumonly.c
> @@ -797,7 +797,7 @@ struct simple_gre_hdr {
>  	TEST_PMD_CORE_CYC_RX_START(start_rx_tsc);
>  	nb_rx = rte_eth_rx_burst(fs->rx_port, fs->rx_queue, pkts_burst,
>  				 nb_pkt_per_burst);
> -	TEST_PMD_CORE_CYC_RX_ADD(fs, start_rx_tsc);
> +	TEST_PMD_CORE_CYC_RX_ADD(fs, start_rx_tsc, nb_rx);
>  	if (unlikely(nb_rx == 0))
>  		return;
>  #ifdef RTE_TEST_PMD_RECORD_BURST_STATS
> @@ -1067,7 +1067,7 @@ struct simple_gre_hdr {
>  	TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
>  	nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue, tx_pkts_burst,
>  			nb_prep);
> -	TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
> +	TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc, nb_tx);
>  
>  	/*
>  	 * Retry if necessary
> @@ -1075,11 +1075,14 @@ struct simple_gre_hdr {
>  	if (unlikely(nb_tx < nb_rx) && fs->retry_enabled) {
>  		retry = 0;
>  		while (nb_tx < nb_rx && retry++ < burst_tx_retry_num) {
> +			uint16_t nb_rt;
> +
>  			rte_delay_us(burst_tx_delay_time);
>  			TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
> -			nb_tx += rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
> +			nb_rt = rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
>  					&tx_pkts_burst[nb_tx], nb_rx - nb_tx);
> -			TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
> +			nb_tx += nb_rt;
> +			TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc, nb_rt);
>  		}
>  	}
>  	fs->tx_packets += nb_tx;

Below this line there is (not visible in patch) code (I'll name it as
"total tx bump" in order to refer to it below):

#ifdef RTE_TEST_PMD_RECORD_BURST_STATS
	fs->tx_burst_stats.pkt_burst_spread[nb_tx]++;
#endif

So when the BURST_STATS is enabled you will have:

struct pkt_burst_stats {
	unsigned int pkt_burst_spread[MAX_PKT_BURST];
	unsigned int pkt_retry_spread[MAX_PKT_BURST];
	uint64_t pkt_ticks_spread[MAX_PKT_BURST];
};

* pkt_ticks_spread will contain ticks per every tx_burst attempt
  (spreaded by the number of packets sent),
* pkt_retry_spread will contain number of tx_burst attempts (spreaded
  by the number of packets sent),
* pkt_burst_spread will have logged +1 for each each bucket that is a
  sum of all packets sent in given rx/tx loop iteration

What the last item actually gives the user that cannot be infered from
the middle one?  If I want to have total number of packets sent I can
sum: slot*spread[slot] over slots.  This should give the same result
when done over burst_spread and retry_spread.  I might be missing
something here so could you elaborate - to me that "total tx bump" looks
like an artificial stat?

If there is actually nothing substantial there, then I'd suggest to
remove pkt_retry_spread (that name actually is also wrong for the sunny
day scenario when there are no retransmissions) and use pkt_burst_spread
for that purpose.  AFAIU the 'retry' member is not used at all for the
RX path - correct?

So my suggestion would be to:
- keep only pkt_burst_spread and pkt_ticks_spread
- use them properly per every rx/tx_burst attempt
- skip the "total tx bump" code

That way RX/TX stats would be symmetrical and reflect the actual
rx/tx_burst attempts.

[...]
> diff --git a/app/test-pmd/testpmd.c b/app/test-pmd/testpmd.c
> index b195880..1d4b55b 100644
> --- a/app/test-pmd/testpmd.c
> +++ b/app/test-pmd/testpmd.c
> @@ -1515,7 +1515,7 @@ struct extmem_param {
>  
>  #ifdef RTE_TEST_PMD_RECORD_BURST_STATS
>  static void
> -pkt_burst_stats_display(const char *rx_tx, struct pkt_burst_stats *pbs)
> +pkt_burst_stats_display(int nrx_tx, struct pkt_burst_stats *pbs)

Maybe another name for 'nrx_tx'?  Like 'is_tx' or 'tx_stats'?  And maybe
bool?

>  {
>  	unsigned int total_burst;
>  	unsigned int nb_burst;
> @@ -1549,8 +1549,8 @@ struct extmem_param {
>  	if (total_burst == 0)
>  		return;
>  	burst_percent[0] = (burst_stats[0] * 100) / total_burst;
> -	printf("  %s-bursts : %u [%d%% of %d pkts", rx_tx, total_burst,
> -	       burst_percent[0], (int) pktnb_stats[0]);
> +	printf("  %s-bursts : %u [%d%% of %d pkts", nrx_tx ? "TX" : "RX",
> +	       total_burst, burst_percent[0], (int) pktnb_stats[0]);
>  	if (burst_stats[0] == total_burst) {
>  		printf("]\n");
>  		return;
> @@ -1568,6 +1568,23 @@ struct extmem_param {
>  	}
>  	printf(" + %d%% of %d pkts + %d%% of others]\n",
>  	       burst_percent[1], (int) pktnb_stats[1], burst_percent[2]);
> +#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
> +	if (!(fwdprof_flags & (nrx_tx ? RECORD_CORE_CYCLES_TX
> +				      : RECORD_CORE_CYCLES_RX)))
> +		return;
> +	for (nb_pkt = 0; nb_pkt < MAX_PKT_BURST; nb_pkt++) {
> +		nb_burst = nrx_tx ? pbs->pkt_retry_spread[nb_pkt]
> +				  : pbs->pkt_burst_spread[nb_pkt];

So you actually don't use pkt_burst_spread for TX.  Then just follow my
suggestion above and remove 'retry' and use pkt_burst_spread as you use
'retry' now.

> +		if (nb_burst == 0)
> +			continue;
> +		printf("  CPU cycles/%u packet burst=%u (total cycles="
> +		       "%"PRIu64" / total %s bursts=%u)\n",
> +		       (unsigned int)nb_pkt,
> +		       (unsigned int)(pbs->pkt_ticks_spread[nb_pkt] / nb_burst),
> +		       pbs->pkt_ticks_spread[nb_pkt],
> +		       nrx_tx ? "TX" : "RX", nb_burst);
> +	}
> +#endif
>  }
>  #endif /* RTE_TEST_PMD_RECORD_BURST_STATS */
[...]
> --- a/app/test-pmd/testpmd.h
> +++ b/app/test-pmd/testpmd.h
> @@ -89,6 +89,10 @@ enum {
>   */
>  struct pkt_burst_stats {
>  	unsigned int pkt_burst_spread[MAX_PKT_BURST];
> +#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
> +	unsigned int pkt_retry_spread[MAX_PKT_BURST];
> +	uint64_t pkt_ticks_spread[MAX_PKT_BURST];
> +#endif
>  };
>  #endif
>  
> @@ -340,21 +344,35 @@ struct queue_stats_mappings {
>  {if (fwdprof_flags & RECORD_CORE_CYCLES_FWD) \
>  {uint64_t tsc = rte_rdtsc(); tsc -= (s); fs->core_cycles += tsc; } }

General comment to the macros below - such macros are usually defined
with the "traditional":
	do { ... } while (0)
structure.  That way they behave better when used in conditional statements:
	if (cond)
		MACRO(args);
	else
		...
With the definition below the "else" would produce compiler error.

I know that they are not used that way, but I'd say it would be cleaner
to do that way anyway :).

> -#define TEST_PMD_CORE_CYC_TX_ADD(fs, s) \
> +#ifdef RTE_TEST_PMD_RECORD_BURST_STATS
> +#define TEST_PMD_CORE_CYC_TX_ADD(fs, s, np) \
> +{if (fwdprof_flags & RECORD_CORE_CYCLES_TX) \
> +{uint64_t tsc = rte_rdtsc(); tsc -= (s); fs->core_tx_cycles += tsc; \
> +fs->tx_burst_stats.pkt_ticks_spread[np] += tsc; \
> +fs->tx_burst_stats.pkt_retry_spread[np]++; } }
> +
> +#define TEST_PMD_CORE_CYC_RX_ADD(fs, s, np) \
> +{if (fwdprof_flags & RECORD_CORE_CYCLES_RX) \
> +{uint64_t tsc = rte_rdtsc(); tsc -= (s); fs->core_rx_cycles += tsc; \
> +fs->rx_burst_stats.pkt_ticks_spread[np] += tsc; } }
> +
> +#else /* RTE_TEST_PMD_RECORD_BURST_STATS */
> +#define TEST_PMD_CORE_CYC_TX_ADD(fs, s, np) \
>  {if (fwdprof_flags & RECORD_CORE_CYCLES_TX) \
>  {uint64_t tsc = rte_rdtsc(); tsc -= (s); fs->core_tx_cycles += tsc; } }
>  
> -#define TEST_PMD_CORE_CYC_RX_ADD(fs, s) \
> +#define TEST_PMD_CORE_CYC_RX_ADD(fs, s, np) \
>  {if (fwdprof_flags & RECORD_CORE_CYCLES_RX) \
>  {uint64_t tsc = rte_rdtsc(); tsc -= (s); fs->core_rx_cycles += tsc; } }
> +#endif /* RTE_TEST_PMD_RECORD_BURST_STATS */

In addition to the above "do/while(0)" comment maybe you could merge
these macros above like (not tested):

#ifdef RTE_TEST_PMD_RECORD_BURST_STATS
#define TEST_PMD_BURST_STATS_ADD(stats, np, tsc) \
	stats.pkt_ticks_spread[np] += tsc; \
	stats.pkt_burst_spread[np]++
#else
#define TEST_PMD_BURST_STATS_ADD(stats, np, tsc)
#endif

#define TEST_PMD_CORE_CYC_TX_ADD(fs, s, np) \
	do { if (fwdprof_flags & RECORD_CORE_CYCLES_TX) { \
		uint64_t tsc = rte_rdtsc(); tsc -= (s); \
		fs->core_tx_cycles += tsc; \
		TEST_PMD_BURST_STATS_ADD(fs->tx_burst_stats, np, tsc); \
	}} while(0)

#define TEST_PMD_CORE_CYC_RX_ADD(fs, s, np) \
	do { if (fwdprof_flags & RECORD_CORE_CYCLES_RX) { \
		uint64_t tsc = rte_rdtsc(); tsc -= (s); \
		fs->core_rx_cycles += tsc; \
		TEST_PMD_BURST_STATS_ADD(fs->rx_burst_stats, np, tsc); \
	}} while(0)

That way handling of RX and TX would be very symmetrical - you'd just
call TX_ADD/RX_ADD where needed and you'd remove both "total TX bump"
and "total RX bump" code.

Note however that accepting this suggestion and removal of "total RX
bump" would also be a behaviour change - since currently when no packet
is read nothing happens (apart from the tics updated) whereas with the
proposed changes (and enabled burst stats) you'd be bumping
rx_burst_stats.pkt_burst_spread[0].  I'd say that would be change for
good since this would allow to have a glimpse how many iterations are idle.

>  #else
>  /* No profiling statistics is configured. */
>  #define TEST_PMD_CORE_CYC_TX_START(a)
>  #define TEST_PMD_CORE_CYC_RX_START(a)
>  #define TEST_PMD_CORE_CYC_FWD_ADD(fs, s)
> -#define TEST_PMD_CORE_CYC_TX_ADD(fs, s)
> -#define TEST_PMD_CORE_CYC_RX_ADD(fs, s)
> +#define TEST_PMD_CORE_CYC_TX_ADD(fs, s, np)
> +#define TEST_PMD_CORE_CYC_RX_ADD(fs, s, np)
>  #endif /* RTE_TEST_PMD_RECORD_CORE_CYCLES */
[...]

With regards
Andrzej Ostruszka

Reviewed-by: Andrzej Ostruszka <aostruszka@marvell.com>
  
Thomas Monjalon April 2, 2020, 11:21 a.m. UTC | #3
19/03/2020 14:50, Viacheslav Ovsiienko:
> +		printf("  CPU cycles/%u packet burst=%u (total cycles="
> +		       "%"PRIu64" / total %s bursts=%u)\n",

You can make it simpler by removing some words:
	CPU cycles -> cycles
	packet burst -> burst
  
Ferruh Yigit April 9, 2020, 11:46 a.m. UTC | #4
On 3/20/2020 6:13 AM, Jerin Jacob wrote:
> On Thu, Mar 19, 2020 at 7:21 PM Viacheslav Ovsiienko
> <viacheslavo@mellanox.com> wrote:
>>
>> The execution time of rx/tx burst routine depends on the burst size.
>> It would be meaningful to research this dependency, the patch
>> provides an extra profiling data per rx/tx burst size.
>>
>> Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
> 
>> +#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
>> +       if (!(fwdprof_flags & (nrx_tx ? RECORD_CORE_CYCLES_TX
>> +                                     : RECORD_CORE_CYCLES_RX)))
>> +               return;
>> +       for (nb_pkt = 0; nb_pkt < MAX_PKT_BURST; nb_pkt++) {
>> +               nb_burst = nrx_tx ? pbs->pkt_retry_spread[nb_pkt]
>> +                                 : pbs->pkt_burst_spread[nb_pkt];
>> +               if (nb_burst == 0)
>> +                       continue;
>> +               printf("  CPU cycles/%u packet burst=%u (total cycles="
>> +                      "%"PRIu64" / total %s bursts=%u)\n",
>> +                      (unsigned int)nb_pkt,
>> +                      (unsigned int)(pbs->pkt_ticks_spread[nb_pkt] / nb_burst),
>> +                      pbs->pkt_ticks_spread[nb_pkt],
>> +                      nrx_tx ? "TX" : "RX", nb_burst);
>> +       }
>> +#endif
> 
> 
> # Thanks for this feature, IMO, It worth to mention in release notes.
> 
> # I see a lot of code get added under RTE_TEST_PMD_RECORD_CORE_CYCLES.
> I agree that it should be under RTE_TEST_PMD_RECORD_CORE_CYCLES. Since
> this flag is not
> by default, there is a risk of compilation issue when this flag is get enabled.
> IMO, it is better to move to _if (0)_ semantics to enable
> - performance when compiler opt-out the code.
> - It forces the compiler to check the compilation errors irrespective
> of the RTE_TEST_PMD_RECORD_CORE_CYCLES state.
> 
> Something like below,
> 
> static __rte_always_inline int
> testpmd_has_stats_feature(void)
> {
> #ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
>         return RTE_TEST_PMD_RECORD_CORE_CYCLES ;
> #else
>         return 0;
> #endif
> }
> 
> 
> if (testpmd_has_stats_feature()) {
> 
> }
> 

Hi Jerin,

In this usage, compiler will removed the "if (0) { }" block, right?
I think this is good idea, we can use it other places too, including this one.
  
Ferruh Yigit April 9, 2020, 12:03 p.m. UTC | #5
On 3/19/2020 1:50 PM, Viacheslav Ovsiienko wrote:
> The execution time of rx/tx burst routine depends on the burst size.
> It would be meaningful to research this dependency, the patch
> provides an extra profiling data per rx/tx burst size.
> 
> Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>

<...>

> @@ -1568,6 +1568,23 @@ struct extmem_param {
>  	}
>  	printf(" + %d%% of %d pkts + %d%% of others]\n",
>  	       burst_percent[1], (int) pktnb_stats[1], burst_percent[2]);
> +#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
> +	if (!(fwdprof_flags & (nrx_tx ? RECORD_CORE_CYCLES_TX
> +				      : RECORD_CORE_CYCLES_RX)))
> +		return;
> +	for (nb_pkt = 0; nb_pkt < MAX_PKT_BURST; nb_pkt++) {
> +		nb_burst = nrx_tx ? pbs->pkt_retry_spread[nb_pkt]
> +				  : pbs->pkt_burst_spread[nb_pkt];
> +		if (nb_burst == 0)
> +			continue;

Why "nb_burst == 0" excluded, wouldn't be interesting to see number of calls
with 0 Rx/Tx and cycles spent there?

> +		printf("  CPU cycles/%u packet burst=%u (total cycles="
> +		       "%"PRIu64" / total %s bursts=%u)\n",
> +		       (unsigned int)nb_pkt,
> +		       (unsigned int)(pbs->pkt_ticks_spread[nb_pkt] / nb_burst),
> +		       pbs->pkt_ticks_spread[nb_pkt],
> +		       nrx_tx ? "TX" : "RX", nb_burst);
> +	}
> +#endif
>  }
>  #endif /* RTE_TEST_PMD_RECORD_BURST_STATS */
>  
> @@ -1601,8 +1618,8 @@ struct extmem_param {
>  	}
>  
>  #ifdef RTE_TEST_PMD_RECORD_BURST_STATS
> -	pkt_burst_stats_display("RX", &fs->rx_burst_stats);
> -	pkt_burst_stats_display("TX", &fs->tx_burst_stats);
> +	pkt_burst_stats_display(false, &fs->rx_burst_stats);
> +	pkt_burst_stats_display(true, &fs->tx_burst_stats);

Instead of this true/false, I would be OK to some duplication and have explicit
Rx/Tx, I believe it would be more clear that way, but no strong opinion on it.

<...>

> +++ b/app/test-pmd/testpmd.h
> @@ -89,6 +89,10 @@ enum {
>   */
>  struct pkt_burst_stats {
>  	unsigned int pkt_burst_spread[MAX_PKT_BURST];
> +#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
> +	unsigned int pkt_retry_spread[MAX_PKT_BURST];

Isn't this keep the value of all Tx burst count, why named as 'retry'?
  
Thomas Monjalon April 9, 2020, 12:09 p.m. UTC | #6
09/04/2020 14:03, Ferruh Yigit:
> On 3/19/2020 1:50 PM, Viacheslav Ovsiienko wrote:
> > -	pkt_burst_stats_display("RX", &fs->rx_burst_stats);
> > -	pkt_burst_stats_display("TX", &fs->tx_burst_stats);
> > +	pkt_burst_stats_display(false, &fs->rx_burst_stats);
> > +	pkt_burst_stats_display(true, &fs->tx_burst_stats);
> 
> Instead of this true/false, I would be OK to some duplication and have explicit
> Rx/Tx, I believe it would be more clear that way, but no strong opinion on it.

I have a strong opinion here: Rx/Tx is not a boolean.
If not a string, it should be a constant (#define).
  
Jerin Jacob April 9, 2020, 12:49 p.m. UTC | #7
On Thu, Apr 9, 2020 at 5:16 PM Ferruh Yigit <ferruh.yigit@intel.com> wrote:
>
> On 3/20/2020 6:13 AM, Jerin Jacob wrote:
> > On Thu, Mar 19, 2020 at 7:21 PM Viacheslav Ovsiienko
> > <viacheslavo@mellanox.com> wrote:
> >>
> >> The execution time of rx/tx burst routine depends on the burst size.
> >> It would be meaningful to research this dependency, the patch
> >> provides an extra profiling data per rx/tx burst size.
> >>
> >> Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
> >
> >> +#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
> >> +       if (!(fwdprof_flags & (nrx_tx ? RECORD_CORE_CYCLES_TX
> >> +                                     : RECORD_CORE_CYCLES_RX)))
> >> +               return;
> >> +       for (nb_pkt = 0; nb_pkt < MAX_PKT_BURST; nb_pkt++) {
> >> +               nb_burst = nrx_tx ? pbs->pkt_retry_spread[nb_pkt]
> >> +                                 : pbs->pkt_burst_spread[nb_pkt];
> >> +               if (nb_burst == 0)
> >> +                       continue;
> >> +               printf("  CPU cycles/%u packet burst=%u (total cycles="
> >> +                      "%"PRIu64" / total %s bursts=%u)\n",
> >> +                      (unsigned int)nb_pkt,
> >> +                      (unsigned int)(pbs->pkt_ticks_spread[nb_pkt] / nb_burst),
> >> +                      pbs->pkt_ticks_spread[nb_pkt],
> >> +                      nrx_tx ? "TX" : "RX", nb_burst);
> >> +       }
> >> +#endif
> >
> >
> > # Thanks for this feature, IMO, It worth to mention in release notes.
> >
> > # I see a lot of code get added under RTE_TEST_PMD_RECORD_CORE_CYCLES.
> > I agree that it should be under RTE_TEST_PMD_RECORD_CORE_CYCLES. Since
> > this flag is not
> > by default, there is a risk of compilation issue when this flag is get enabled.
> > IMO, it is better to move to _if (0)_ semantics to enable
> > - performance when compiler opt-out the code.
> > - It forces the compiler to check the compilation errors irrespective
> > of the RTE_TEST_PMD_RECORD_CORE_CYCLES state.
> >
> > Something like below,
> >
> > static __rte_always_inline int
> > testpmd_has_stats_feature(void)
> > {
> > #ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
> >         return RTE_TEST_PMD_RECORD_CORE_CYCLES ;
> > #else
> >         return 0;
> > #endif
> > }
> >
> >
> > if (testpmd_has_stats_feature()) {
> >
> > }
> >
>
> Hi Jerin,

Hi Ferruh,

> In this usage, compiler will removed the "if (0) { }" block, right?

Yes.

> I think this is good idea, we can use it other places too, including this one.

Yes.
  

Patch

diff --git a/app/test-pmd/csumonly.c b/app/test-pmd/csumonly.c
index 4104737..c966892 100644
--- a/app/test-pmd/csumonly.c
+++ b/app/test-pmd/csumonly.c
@@ -797,7 +797,7 @@  struct simple_gre_hdr {
 	TEST_PMD_CORE_CYC_RX_START(start_rx_tsc);
 	nb_rx = rte_eth_rx_burst(fs->rx_port, fs->rx_queue, pkts_burst,
 				 nb_pkt_per_burst);
-	TEST_PMD_CORE_CYC_RX_ADD(fs, start_rx_tsc);
+	TEST_PMD_CORE_CYC_RX_ADD(fs, start_rx_tsc, nb_rx);
 	if (unlikely(nb_rx == 0))
 		return;
 #ifdef RTE_TEST_PMD_RECORD_BURST_STATS
@@ -1067,7 +1067,7 @@  struct simple_gre_hdr {
 	TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
 	nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue, tx_pkts_burst,
 			nb_prep);
-	TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
+	TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc, nb_tx);
 
 	/*
 	 * Retry if necessary
@@ -1075,11 +1075,14 @@  struct simple_gre_hdr {
 	if (unlikely(nb_tx < nb_rx) && fs->retry_enabled) {
 		retry = 0;
 		while (nb_tx < nb_rx && retry++ < burst_tx_retry_num) {
+			uint16_t nb_rt;
+
 			rte_delay_us(burst_tx_delay_time);
 			TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
-			nb_tx += rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
+			nb_rt = rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
 					&tx_pkts_burst[nb_tx], nb_rx - nb_tx);
-			TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
+			nb_tx += nb_rt;
+			TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc, nb_rt);
 		}
 	}
 	fs->tx_packets += nb_tx;
diff --git a/app/test-pmd/flowgen.c b/app/test-pmd/flowgen.c
index 51e87b0..9189e7b 100644
--- a/app/test-pmd/flowgen.c
+++ b/app/test-pmd/flowgen.c
@@ -107,7 +107,7 @@ 
 	TEST_PMD_CORE_CYC_RX_START(start_rx_tsc);
 	nb_rx = rte_eth_rx_burst(fs->rx_port, fs->rx_queue, pkts_burst,
 				 nb_pkt_per_burst);
-	TEST_PMD_CORE_CYC_RX_ADD(fs, start_rx_tsc);
+	TEST_PMD_CORE_CYC_RX_ADD(fs, start_rx_tsc, nb_rx);
 	fs->rx_packets += nb_rx;
 
 	for (i = 0; i < nb_rx; i++)
@@ -179,18 +179,21 @@ 
 
 	TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
 	nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue, pkts_burst, nb_pkt);
-	TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
+	TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc, nb_tx);
 	/*
 	 * Retry if necessary
 	 */
 	if (unlikely(nb_tx < nb_rx) && fs->retry_enabled) {
 		retry = 0;
 		while (nb_tx < nb_rx && retry++ < burst_tx_retry_num) {
+			uint16_t nb_rt;
+
 			rte_delay_us(burst_tx_delay_time);
 			TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
-			nb_tx += rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
+			nb_rt = rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
 					&pkts_burst[nb_tx], nb_rx - nb_tx);
-			TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
+			nb_tx += nb_rt;
+			TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc, nb_rt);
 		}
 	}
 	fs->tx_packets += nb_tx;
diff --git a/app/test-pmd/icmpecho.c b/app/test-pmd/icmpecho.c
index 8843183..0c8a8af 100644
--- a/app/test-pmd/icmpecho.c
+++ b/app/test-pmd/icmpecho.c
@@ -304,7 +304,7 @@ 
 	TEST_PMD_CORE_CYC_RX_START(start_rx_tsc);
 	nb_rx = rte_eth_rx_burst(fs->rx_port, fs->rx_queue, pkts_burst,
 				 nb_pkt_per_burst);
-	TEST_PMD_CORE_CYC_RX_ADD(fs, start_rx_tsc);
+	TEST_PMD_CORE_CYC_RX_ADD(fs, start_rx_tsc, nb_rx);
 	if (unlikely(nb_rx == 0))
 		return;
 
@@ -492,7 +492,7 @@ 
 		TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
 		nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue, pkts_burst,
 					 nb_replies);
-		TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
+		TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc, nb_tx);
 		/*
 		 * Retry if necessary
 		 */
@@ -500,13 +500,17 @@ 
 			retry = 0;
 			while (nb_tx < nb_replies &&
 					retry++ < burst_tx_retry_num) {
+				uint16_t nb_rt;
+
 				rte_delay_us(burst_tx_delay_time);
 				TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
-				nb_tx += rte_eth_tx_burst(fs->tx_port,
+				nb_rt = rte_eth_tx_burst(fs->tx_port,
 						fs->tx_queue,
 						&pkts_burst[nb_tx],
 						nb_replies - nb_tx);
-				TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
+				nb_tx += nb_rt;
+				TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc,
+							 nb_rt);
 			}
 		}
 		fs->tx_packets += nb_tx;
diff --git a/app/test-pmd/iofwd.c b/app/test-pmd/iofwd.c
index 9ff6531..b05ed02 100644
--- a/app/test-pmd/iofwd.c
+++ b/app/test-pmd/iofwd.c
@@ -62,7 +62,7 @@ 
 	TEST_PMD_CORE_CYC_RX_START(start_rx_tsc);
 	nb_rx = rte_eth_rx_burst(fs->rx_port, fs->rx_queue,
 			pkts_burst, nb_pkt_per_burst);
-	TEST_PMD_CORE_CYC_RX_ADD(fs, start_rx_tsc);
+	TEST_PMD_CORE_CYC_RX_ADD(fs, start_rx_tsc, nb_rx);
 	if (unlikely(nb_rx == 0))
 		return;
 	fs->rx_packets += nb_rx;
@@ -73,18 +73,21 @@ 
 	TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
 	nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
 			pkts_burst, nb_rx);
-	TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
+	TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc, nb_tx);
 	/*
 	 * Retry if necessary
 	 */
 	if (unlikely(nb_tx < nb_rx) && fs->retry_enabled) {
 		retry = 0;
 		while (nb_tx < nb_rx && retry++ < burst_tx_retry_num) {
+			uint16_t nb_rt;
+
 			rte_delay_us(burst_tx_delay_time);
 			TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
-			nb_tx += rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
+			nb_rt = rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
 					&pkts_burst[nb_tx], nb_rx - nb_tx);
-			TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
+			nb_tx += nb_rt;
+			TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc, nb_rt);
 		}
 	}
 	fs->tx_packets += nb_tx;
diff --git a/app/test-pmd/macfwd.c b/app/test-pmd/macfwd.c
index f4a213e..a4aae0c 100644
--- a/app/test-pmd/macfwd.c
+++ b/app/test-pmd/macfwd.c
@@ -67,7 +67,7 @@ 
 	TEST_PMD_CORE_CYC_RX_START(start_rx_tsc);
 	nb_rx = rte_eth_rx_burst(fs->rx_port, fs->rx_queue, pkts_burst,
 				 nb_pkt_per_burst);
-	TEST_PMD_CORE_CYC_RX_ADD(fs, start_tx_tsc);
+	TEST_PMD_CORE_CYC_RX_ADD(fs, start_tx_tsc, nb_rx);
 	if (unlikely(nb_rx == 0))
 		return;
 
@@ -102,18 +102,21 @@ 
 	}
 	TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
 	nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue, pkts_burst, nb_rx);
-	TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
+	TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc, nb_tx);
 	/*
 	 * Retry if necessary
 	 */
 	if (unlikely(nb_tx < nb_rx) && fs->retry_enabled) {
 		retry = 0;
 		while (nb_tx < nb_rx && retry++ < burst_tx_retry_num) {
+			uint16_t nb_rt;
+
 			rte_delay_us(burst_tx_delay_time);
 			TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
-			nb_tx += rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
+			nb_rt = rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
 					&pkts_burst[nb_tx], nb_rx - nb_tx);
-			TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
+			nb_tx += nb_rt;
+			TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc, nb_rt);
 		}
 	}
 
diff --git a/app/test-pmd/macswap.c b/app/test-pmd/macswap.c
index 5cb3133..57628ba 100644
--- a/app/test-pmd/macswap.c
+++ b/app/test-pmd/macswap.c
@@ -68,7 +68,7 @@ 
 	TEST_PMD_CORE_CYC_RX_START(start_rx_tsc);
 	nb_rx = rte_eth_rx_burst(fs->rx_port, fs->rx_queue, pkts_burst,
 				 nb_pkt_per_burst);
-	TEST_PMD_CORE_CYC_RX_ADD(fs, start_rx_tsc);
+	TEST_PMD_CORE_CYC_RX_ADD(fs, start_rx_tsc, nb_rx);
 	if (unlikely(nb_rx == 0))
 		return;
 
@@ -82,18 +82,21 @@ 
 
 	TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
 	nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue, pkts_burst, nb_rx);
-	TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
+	TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc, nb_tx);
 	/*
 	 * Retry if necessary
 	 */
 	if (unlikely(nb_tx < nb_rx) && fs->retry_enabled) {
 		retry = 0;
 		while (nb_tx < nb_rx && retry++ < burst_tx_retry_num) {
+			uint16_t nb_rt;
+
 			rte_delay_us(burst_tx_delay_time);
 			TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
-			nb_tx += rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
+			nb_rt = rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
 					&pkts_burst[nb_tx], nb_rx - nb_tx);
-			TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
+			nb_tx += nb_rt;
+			TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc, nb_rt);
 		}
 	}
 	fs->tx_packets += nb_tx;
diff --git a/app/test-pmd/rxonly.c b/app/test-pmd/rxonly.c
index 2820d7f..ee79e7b 100644
--- a/app/test-pmd/rxonly.c
+++ b/app/test-pmd/rxonly.c
@@ -60,7 +60,7 @@ 
 	TEST_PMD_CORE_CYC_RX_START(start_rx_tsc);
 	nb_rx = rte_eth_rx_burst(fs->rx_port, fs->rx_queue, pkts_burst,
 				 nb_pkt_per_burst);
-	TEST_PMD_CORE_CYC_RX_ADD(fs, start_rx_tsc);
+	TEST_PMD_CORE_CYC_RX_ADD(fs, start_rx_tsc, nb_rx);
 	if (unlikely(nb_rx == 0))
 		return;
 
diff --git a/app/test-pmd/softnicfwd.c b/app/test-pmd/softnicfwd.c
index b78f2ce..793677d 100644
--- a/app/test-pmd/softnicfwd.c
+++ b/app/test-pmd/softnicfwd.c
@@ -96,7 +96,7 @@  struct tm_hierarchy {
 	TEST_PMD_CORE_CYC_RX_START(start_rx_tsc);
 	nb_rx = rte_eth_rx_burst(fs->rx_port, fs->rx_queue,
 			pkts_burst, nb_pkt_per_burst);
-	TEST_PMD_CORE_CYC_RX_ADD(fs, start_rx_tsc);
+	TEST_PMD_CORE_CYC_RX_ADD(fs, start_rx_tsc, nb_rx);
 	fs->rx_packets += nb_rx;
 
 #ifdef RTE_TEST_PMD_RECORD_BURST_STATS
@@ -106,17 +106,20 @@  struct tm_hierarchy {
 	TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
 	nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
 			pkts_burst, nb_rx);
-	TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
+	TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc, nb_tx);
 
 	/* Retry if necessary */
 	if (unlikely(nb_tx < nb_rx) && fs->retry_enabled) {
 		retry = 0;
 		while (nb_tx < nb_rx && retry++ < burst_tx_retry_num) {
+			uint16_t nb_rt;
+
 			rte_delay_us(burst_tx_delay_time);
 			TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
-			nb_tx += rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
+			nb_rt = rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
 					&pkts_burst[nb_tx], nb_rx - nb_tx);
-			TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
+			nb_tx += nb_rt;
+			TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc, nb_rt);
 		}
 	}
 	fs->tx_packets += nb_tx;
diff --git a/app/test-pmd/testpmd.c b/app/test-pmd/testpmd.c
index b195880..1d4b55b 100644
--- a/app/test-pmd/testpmd.c
+++ b/app/test-pmd/testpmd.c
@@ -1515,7 +1515,7 @@  struct extmem_param {
 
 #ifdef RTE_TEST_PMD_RECORD_BURST_STATS
 static void
-pkt_burst_stats_display(const char *rx_tx, struct pkt_burst_stats *pbs)
+pkt_burst_stats_display(int nrx_tx, struct pkt_burst_stats *pbs)
 {
 	unsigned int total_burst;
 	unsigned int nb_burst;
@@ -1549,8 +1549,8 @@  struct extmem_param {
 	if (total_burst == 0)
 		return;
 	burst_percent[0] = (burst_stats[0] * 100) / total_burst;
-	printf("  %s-bursts : %u [%d%% of %d pkts", rx_tx, total_burst,
-	       burst_percent[0], (int) pktnb_stats[0]);
+	printf("  %s-bursts : %u [%d%% of %d pkts", nrx_tx ? "TX" : "RX",
+	       total_burst, burst_percent[0], (int) pktnb_stats[0]);
 	if (burst_stats[0] == total_burst) {
 		printf("]\n");
 		return;
@@ -1568,6 +1568,23 @@  struct extmem_param {
 	}
 	printf(" + %d%% of %d pkts + %d%% of others]\n",
 	       burst_percent[1], (int) pktnb_stats[1], burst_percent[2]);
+#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
+	if (!(fwdprof_flags & (nrx_tx ? RECORD_CORE_CYCLES_TX
+				      : RECORD_CORE_CYCLES_RX)))
+		return;
+	for (nb_pkt = 0; nb_pkt < MAX_PKT_BURST; nb_pkt++) {
+		nb_burst = nrx_tx ? pbs->pkt_retry_spread[nb_pkt]
+				  : pbs->pkt_burst_spread[nb_pkt];
+		if (nb_burst == 0)
+			continue;
+		printf("  CPU cycles/%u packet burst=%u (total cycles="
+		       "%"PRIu64" / total %s bursts=%u)\n",
+		       (unsigned int)nb_pkt,
+		       (unsigned int)(pbs->pkt_ticks_spread[nb_pkt] / nb_burst),
+		       pbs->pkt_ticks_spread[nb_pkt],
+		       nrx_tx ? "TX" : "RX", nb_burst);
+	}
+#endif
 }
 #endif /* RTE_TEST_PMD_RECORD_BURST_STATS */
 
@@ -1601,8 +1618,8 @@  struct extmem_param {
 	}
 
 #ifdef RTE_TEST_PMD_RECORD_BURST_STATS
-	pkt_burst_stats_display("RX", &fs->rx_burst_stats);
-	pkt_burst_stats_display("TX", &fs->tx_burst_stats);
+	pkt_burst_stats_display(false, &fs->rx_burst_stats);
+	pkt_burst_stats_display(true, &fs->tx_burst_stats);
 #endif
 }
 
@@ -1742,10 +1759,10 @@  struct extmem_param {
 
 #ifdef RTE_TEST_PMD_RECORD_BURST_STATS
 		if (ports_stats[pt_id].rx_stream)
-			pkt_burst_stats_display("RX",
+			pkt_burst_stats_display(false,
 				&ports_stats[pt_id].rx_stream->rx_burst_stats);
 		if (ports_stats[pt_id].tx_stream)
-			pkt_burst_stats_display("TX",
+			pkt_burst_stats_display(true,
 				&ports_stats[pt_id].tx_stream->tx_burst_stats);
 #endif
 
diff --git a/app/test-pmd/testpmd.h b/app/test-pmd/testpmd.h
index 6177a50..90eb0ef 100644
--- a/app/test-pmd/testpmd.h
+++ b/app/test-pmd/testpmd.h
@@ -89,6 +89,10 @@  enum {
  */
 struct pkt_burst_stats {
 	unsigned int pkt_burst_spread[MAX_PKT_BURST];
+#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
+	unsigned int pkt_retry_spread[MAX_PKT_BURST];
+	uint64_t pkt_ticks_spread[MAX_PKT_BURST];
+#endif
 };
 #endif
 
@@ -340,21 +344,35 @@  struct queue_stats_mappings {
 {if (fwdprof_flags & RECORD_CORE_CYCLES_FWD) \
 {uint64_t tsc = rte_rdtsc(); tsc -= (s); fs->core_cycles += tsc; } }
 
-#define TEST_PMD_CORE_CYC_TX_ADD(fs, s) \
+#ifdef RTE_TEST_PMD_RECORD_BURST_STATS
+#define TEST_PMD_CORE_CYC_TX_ADD(fs, s, np) \
+{if (fwdprof_flags & RECORD_CORE_CYCLES_TX) \
+{uint64_t tsc = rte_rdtsc(); tsc -= (s); fs->core_tx_cycles += tsc; \
+fs->tx_burst_stats.pkt_ticks_spread[np] += tsc; \
+fs->tx_burst_stats.pkt_retry_spread[np]++; } }
+
+#define TEST_PMD_CORE_CYC_RX_ADD(fs, s, np) \
+{if (fwdprof_flags & RECORD_CORE_CYCLES_RX) \
+{uint64_t tsc = rte_rdtsc(); tsc -= (s); fs->core_rx_cycles += tsc; \
+fs->rx_burst_stats.pkt_ticks_spread[np] += tsc; } }
+
+#else /* RTE_TEST_PMD_RECORD_BURST_STATS */
+#define TEST_PMD_CORE_CYC_TX_ADD(fs, s, np) \
 {if (fwdprof_flags & RECORD_CORE_CYCLES_TX) \
 {uint64_t tsc = rte_rdtsc(); tsc -= (s); fs->core_tx_cycles += tsc; } }
 
-#define TEST_PMD_CORE_CYC_RX_ADD(fs, s) \
+#define TEST_PMD_CORE_CYC_RX_ADD(fs, s, np) \
 {if (fwdprof_flags & RECORD_CORE_CYCLES_RX) \
 {uint64_t tsc = rte_rdtsc(); tsc -= (s); fs->core_rx_cycles += tsc; } }
+#endif /* RTE_TEST_PMD_RECORD_BURST_STATS */
 
 #else
 /* No profiling statistics is configured. */
 #define TEST_PMD_CORE_CYC_TX_START(a)
 #define TEST_PMD_CORE_CYC_RX_START(a)
 #define TEST_PMD_CORE_CYC_FWD_ADD(fs, s)
-#define TEST_PMD_CORE_CYC_TX_ADD(fs, s)
-#define TEST_PMD_CORE_CYC_RX_ADD(fs, s)
+#define TEST_PMD_CORE_CYC_TX_ADD(fs, s, np)
+#define TEST_PMD_CORE_CYC_RX_ADD(fs, s, np)
 #endif /* RTE_TEST_PMD_RECORD_CORE_CYCLES */
 
 /* globals used for configuration */
diff --git a/app/test-pmd/txonly.c b/app/test-pmd/txonly.c
index 8ff7410..593044e 100644
--- a/app/test-pmd/txonly.c
+++ b/app/test-pmd/txonly.c
@@ -299,18 +299,21 @@ 
 
 	TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
 	nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue, pkts_burst, nb_pkt);
-	TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
+	TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc, nb_tx);
 	/*
 	 * Retry if necessary
 	 */
 	if (unlikely(nb_tx < nb_pkt) && fs->retry_enabled) {
 		retry = 0;
 		while (nb_tx < nb_pkt && retry++ < burst_tx_retry_num) {
+			uint16_t nb_rt;
+
 			rte_delay_us(burst_tx_delay_time);
 			TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
-			nb_tx += rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
+			nb_rt = rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
 					&pkts_burst[nb_tx], nb_pkt - nb_tx);
-			TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
+			nb_tx += nb_rt;
+			TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc, nb_rt);
 		}
 	}
 	fs->tx_packets += nb_tx;