[RFC] app/testpmd: add profiling for Rx/Tx burst routines

Message ID 1558936043-6259-1-git-send-email-viacheslavo@mellanox.com (mailing list archive)
State Superseded, archived
Delegated to: Ferruh Yigit
Headers
Series [RFC] app/testpmd: add profiling for Rx/Tx burst routines |

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/Intel-compilation fail Compilation issues

Commit Message

Slava Ovsiienko May 27, 2019, 5:47 a.m. UTC
  There is a testpmd configuration option called
RTE_TEST_PMD_RECORD_CORE_CYCLES. If it is turned on,
the testpmd application measures the CPU clocks spent
within the forwarding loop. This time is the sum of the execution
times of rte_eth_rx_burst(), rte_eth_tx_burst(), rte_delay_us(),
rte_pktmbuf_free() and so on, depending on the fwd mode set.

While debugging and optimizing the performance of datapath
burst routines it would be useful to see the pure execution
times of these routines. It is proposed to add separate profiling
options:

CONFIG_RTE_TEST_PMD_RECORD_CORE_TX_CYCLES
    enables gathering profiling data for transmit datapath,
    ticks spent within rte_eth_tx_burst()

CONFIG_RTE_TEST_PMD_RECORD_CORE_RX_CYCLES
    enables gathering profiling data for receive datapath,
    ticks spent within rte_eth_rx_burst()

Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
---
 app/test-pmd/csumonly.c   | 25 ++++++++++++-------------
 app/test-pmd/flowgen.c    | 25 +++++++++++++------------
 app/test-pmd/icmpecho.c   | 26 +++++++++++++-------------
 app/test-pmd/iofwd.c      | 24 ++++++++++++------------
 app/test-pmd/macfwd.c     | 24 +++++++++++++-----------
 app/test-pmd/macswap.c    | 26 ++++++++++++++------------
 app/test-pmd/rxonly.c     | 17 ++++++-----------
 app/test-pmd/softnicfwd.c | 24 ++++++++++++------------
 app/test-pmd/testpmd.c    | 32 ++++++++++++++++++++++++++++++++
 app/test-pmd/testpmd.h    | 40 ++++++++++++++++++++++++++++++++++++++++
 app/test-pmd/txonly.c     | 23 +++++++++++------------
 config/common_base        |  2 ++
 12 files changed, 180 insertions(+), 108 deletions(-)
  

Comments

Iremonger, Bernard June 7, 2019, 4:07 p.m. UTC | #1
Hi Viacheslav,


> -----Original Message-----
> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Viacheslav Ovsiienko
> Sent: Monday, May 27, 2019 6:47 AM
> To: dev@dpdk.org
> Cc: Yigit, Ferruh <ferruh.yigit@intel.com>
> Subject: [dpdk-dev] [RFC] app/testpmd: add profiling for Rx/Tx burst routines
> 
> There is the testpmd configuration option called
> RTE_TEST_PMD_RECORD_CORE_CYCLES, if this one is turned on the testpmd
> application measures the CPU clocks spent within forwarding loop. This time is
> the sum of execution times of rte_eth_rx_burst(), rte_eth_tx_burst(),
> rte_delay_us(),
>  rte_pktmbuf_free() and so on, depending on fwd mode set.
> 
> While debugging and performance optimization of datapath burst routines tt
> would be useful to see the pure execution times of these ones. It is proposed to
> add separated profiling
> options:
> 
> CONFIG_RTE_TEST_PMD_RECORD_CORE_TX_CYCLES
>     enables gathering profiling data for transmit datapath,
>     ticks spent within rte_eth_tx_burst()
> 
> CONFIG_RTE_TEST_PMD_RECORD_CORE_RX_CYCLES
>     enables gathering profiling data for transmit datapath,
>     ticks spent within rte_eth_rx_burst()
> 
> Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
> ---
>  app/test-pmd/csumonly.c   | 25 ++++++++++++-------------
>  app/test-pmd/flowgen.c    | 25 +++++++++++++------------
>  app/test-pmd/icmpecho.c   | 26 +++++++++++++-------------
>  app/test-pmd/iofwd.c      | 24 ++++++++++++------------
>  app/test-pmd/macfwd.c     | 24 +++++++++++++-----------
>  app/test-pmd/macswap.c    | 26 ++++++++++++++------------
>  app/test-pmd/rxonly.c     | 17 ++++++-----------
>  app/test-pmd/softnicfwd.c | 24 ++++++++++++------------
>  app/test-pmd/testpmd.c    | 32 ++++++++++++++++++++++++++++++++
>  app/test-pmd/testpmd.h    | 40
> ++++++++++++++++++++++++++++++++++++++++
>  app/test-pmd/txonly.c     | 23 +++++++++++------------
>  config/common_base        |  2 ++
>  12 files changed, 180 insertions(+), 108 deletions(-)
> 
> diff --git a/app/test-pmd/csumonly.c b/app/test-pmd/csumonly.c index
> f4f2a7b..251e179 100644
> --- a/app/test-pmd/csumonly.c
> +++ b/app/test-pmd/csumonly.c
> @@ -710,19 +710,19 @@ struct simple_gre_hdr {
>  	uint16_t nb_segments = 0;
>  	int ret;
> 
> -#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
> -	uint64_t start_tsc;
> -	uint64_t end_tsc;
> -	uint64_t core_cycles;
> +#if defined(RTE_TEST_PMD_RECORD_CORE_TX_CYCLES)
> +	uint64_t start_tx_tsc;

Should the RTE_TEST_PMD_RECORD_CORE_CYCLES macro be checked here too?

>  #endif
> -
> -#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
> -	start_tsc = rte_rdtsc();
> +#if defined(RTE_TEST_PMD_RECORD_CORE_CYCLES) || \
> +	defined(RTE_TEST_PMD_RECORD_CORE_RX_CYCLES)
> +	uint64_t start_rx_tsc;
>  #endif
> 
>  	/* receive a burst of packet */
> +	TEST_PMD_CORE_CYC_RX_START(start_rx_tsc);
>  	nb_rx = rte_eth_rx_burst(fs->rx_port, fs->rx_queue, pkts_burst,
>  				 nb_pkt_per_burst);
> +	TEST_PMD_CORE_CYC_RX_ADD(fs, start_rx_tsc);
>  	if (unlikely(nb_rx == 0))
>  		return;
>  #ifdef RTE_TEST_PMD_RECORD_BURST_STATS
> @@ -982,8 +982,10 @@ struct simple_gre_hdr {
>  		printf("Preparing packet burst to transmit failed: %s\n",
>  				rte_strerror(rte_errno));
> 
> +	TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
>  	nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue, tx_pkts_burst,
>  			nb_prep);
> +	TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
> 
>  	/*
>  	 * Retry if necessary
> @@ -992,8 +994,10 @@ struct simple_gre_hdr {
>  		retry = 0;
>  		while (nb_tx < nb_rx && retry++ < burst_tx_retry_num) {
>  			rte_delay_us(burst_tx_delay_time);
> +			TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
>  			nb_tx += rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
>  					&tx_pkts_burst[nb_tx], nb_rx - nb_tx);
> +			TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
>  		}
>  	}
>  	fs->tx_packets += nb_tx;
> @@ -1010,12 +1014,7 @@ struct simple_gre_hdr {
>  			rte_pktmbuf_free(tx_pkts_burst[nb_tx]);
>  		} while (++nb_tx < nb_rx);
>  	}
> -
> -#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
> -	end_tsc = rte_rdtsc();
> -	core_cycles = (end_tsc - start_tsc);
> -	fs->core_cycles = (uint64_t) (fs->core_cycles + core_cycles);
> -#endif
> +	TEST_PMD_CORE_CYC_FWD_ADD(fs, start_rx_tsc);
>  }
> 
>  struct fwd_engine csum_fwd_engine = {
> diff --git a/app/test-pmd/flowgen.c b/app/test-pmd/flowgen.c index
> 3214e3c..b128e68 100644
> --- a/app/test-pmd/flowgen.c
> +++ b/app/test-pmd/flowgen.c
> @@ -130,20 +130,21 @@
>  	uint16_t i;
>  	uint32_t retry;
>  	uint64_t tx_offloads;
> -#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
> -	uint64_t start_tsc;
> -	uint64_t end_tsc;
> -	uint64_t core_cycles;
> -#endif
>  	static int next_flow = 0;
> 
> -#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
> -	start_tsc = rte_rdtsc();
> +#if defined(RTE_TEST_PMD_RECORD_CORE_TX_CYCLES)

Should the RTE_TEST_PMD_RECORD_CORE_CYCLES macro be checked here too?

> +	uint64_t start_tx_tsc;
> +#endif
> +#if defined(RTE_TEST_PMD_RECORD_CORE_CYCLES) || \
> +	defined(RTE_TEST_PMD_RECORD_CORE_RX_CYCLES)
> +	uint64_t start_rx_tsc;
>  #endif
> 
>  	/* Receive a burst of packets and discard them. */
> +	TEST_PMD_CORE_CYC_RX_START(start_rx_tsc);
>  	nb_rx = rte_eth_rx_burst(fs->rx_port, fs->rx_queue, pkts_burst,
>  				 nb_pkt_per_burst);
> +	TEST_PMD_CORE_CYC_RX_ADD(fs, start_rx_tsc);
>  	fs->rx_packets += nb_rx;
> 
>  	for (i = 0; i < nb_rx; i++)
> @@ -212,7 +213,9 @@
>  		next_flow = (next_flow + 1) % cfg_n_flows;
>  	}
> 
> +	TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
>  	nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue, pkts_burst, nb_pkt);
> +	TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
>  	/*
>  	 * Retry if necessary
>  	 */
> @@ -220,8 +223,10 @@
>  		retry = 0;
>  		while (nb_tx < nb_rx && retry++ < burst_tx_retry_num) {
>  			rte_delay_us(burst_tx_delay_time);
> +			TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
>  			nb_tx += rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
>  					&pkts_burst[nb_tx], nb_rx - nb_tx);
> +			TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
>  		}
>  	}
>  	fs->tx_packets += nb_tx;
> @@ -239,11 +244,7 @@
>  			rte_pktmbuf_free(pkts_burst[nb_tx]);
>  		} while (++nb_tx < nb_pkt);
>  	}
> -#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
> -	end_tsc = rte_rdtsc();
> -	core_cycles = (end_tsc - start_tsc);
> -	fs->core_cycles = (uint64_t) (fs->core_cycles + core_cycles);
> -#endif
> +	TEST_PMD_CORE_CYC_FWD_ADD(fs, start_rx_tsc);
>  }
> 
>  struct fwd_engine flow_gen_engine = {
> diff --git a/app/test-pmd/icmpecho.c b/app/test-pmd/icmpecho.c index
> 55d266d..a539fe8 100644
> --- a/app/test-pmd/icmpecho.c
> +++ b/app/test-pmd/icmpecho.c
> @@ -293,21 +293,22 @@
>  	uint32_t cksum;
>  	uint8_t  i;
>  	int l2_len;
> -#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
> -	uint64_t start_tsc;
> -	uint64_t end_tsc;
> -	uint64_t core_cycles;
> -#endif
> 
> -#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
> -	start_tsc = rte_rdtsc();
> +#if defined(RTE_TEST_PMD_RECORD_CORE_TX_CYCLES)
> +	uint64_t start_tx_tsc;
> +#endif
> +#if defined(RTE_TEST_PMD_RECORD_CORE_CYCLES) || \
> +	defined(RTE_TEST_PMD_RECORD_CORE_RX_CYCLES)
> +	uint64_t start_rx_tsc;
>  #endif
> 
>  	/*
>  	 * First, receive a burst of packets.
>  	 */
> +	TEST_PMD_CORE_CYC_RX_START(start_rx_tsc);
>  	nb_rx = rte_eth_rx_burst(fs->rx_port, fs->rx_queue, pkts_burst,
>  				 nb_pkt_per_burst);
> +	TEST_PMD_CORE_CYC_RX_ADD(fs, start_rx_tsc);
>  	if (unlikely(nb_rx == 0))
>  		return;
> 
> @@ -487,8 +488,10 @@
> 
>  	/* Send back ICMP echo replies, if any. */
>  	if (nb_replies > 0) {
> +		TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
>  		nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue, pkts_burst,
>  					 nb_replies);
> +		TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
>  		/*
>  		 * Retry if necessary
>  		 */
> @@ -497,10 +500,12 @@
>  			while (nb_tx < nb_replies &&
>  					retry++ < burst_tx_retry_num) {
>  				rte_delay_us(burst_tx_delay_time);
> +
> 	TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
>  				nb_tx += rte_eth_tx_burst(fs->tx_port,
>  						fs->tx_queue,
>  						&pkts_burst[nb_tx],
>  						nb_replies - nb_tx);
> +				TEST_PMD_CORE_CYC_TX_ADD(fs,
> start_tx_tsc);
>  			}
>  		}
>  		fs->tx_packets += nb_tx;
> @@ -514,12 +519,7 @@
>  			} while (++nb_tx < nb_replies);
>  		}
>  	}
> -
> -#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
> -	end_tsc = rte_rdtsc();
> -	core_cycles = (end_tsc - start_tsc);
> -	fs->core_cycles = (uint64_t) (fs->core_cycles + core_cycles);
> -#endif
> +	TEST_PMD_CORE_CYC_FWD_ADD(fs, start_rx_tsc);
>  }
> 
>  struct fwd_engine icmp_echo_engine = {
> diff --git a/app/test-pmd/iofwd.c b/app/test-pmd/iofwd.c index
> 9dce76e..dc66a88 100644
> --- a/app/test-pmd/iofwd.c
> +++ b/app/test-pmd/iofwd.c
> @@ -51,21 +51,21 @@
>  	uint16_t nb_tx;
>  	uint32_t retry;
> 
> -#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
> -	uint64_t start_tsc;
> -	uint64_t end_tsc;
> -	uint64_t core_cycles;
> +#if defined(RTE_TEST_PMD_RECORD_CORE_TX_CYCLES)

Should the RTE_TEST_PMD_RECORD_CORE_CYCLES macro be checked here too?

> +	uint64_t start_tx_tsc;
>  #endif
> -
> -#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
> -	start_tsc = rte_rdtsc();
> +#if defined(RTE_TEST_PMD_RECORD_CORE_CYCLES) || \
> +	defined(RTE_TEST_PMD_RECORD_CORE_RX_CYCLES)
> +	uint64_t start_rx_tsc;
>  #endif
> 
>  	/*
>  	 * Receive a burst of packets and forward them.
>  	 */
> +	TEST_PMD_CORE_CYC_RX_START(start_rx_tsc);
>  	nb_rx = rte_eth_rx_burst(fs->rx_port, fs->rx_queue,
>  			pkts_burst, nb_pkt_per_burst);
> +	TEST_PMD_CORE_CYC_RX_ADD(fs, start_rx_tsc);
>  	if (unlikely(nb_rx == 0))
>  		return;
>  	fs->rx_packets += nb_rx;
> @@ -73,8 +73,10 @@
>  #ifdef RTE_TEST_PMD_RECORD_BURST_STATS
>  	fs->rx_burst_stats.pkt_burst_spread[nb_rx]++;
>  #endif
> +	TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
>  	nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
>  			pkts_burst, nb_rx);
> +	TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
>  	/*
>  	 * Retry if necessary
>  	 */
> @@ -82,8 +84,10 @@
>  		retry = 0;
>  		while (nb_tx < nb_rx && retry++ < burst_tx_retry_num) {
>  			rte_delay_us(burst_tx_delay_time);
> +			TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
>  			nb_tx += rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
>  					&pkts_burst[nb_tx], nb_rx - nb_tx);
> +			TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
>  		}
>  	}
>  	fs->tx_packets += nb_tx;
> @@ -96,11 +100,7 @@
>  			rte_pktmbuf_free(pkts_burst[nb_tx]);
>  		} while (++nb_tx < nb_rx);
>  	}
> -#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
> -	end_tsc = rte_rdtsc();
> -	core_cycles = (end_tsc - start_tsc);
> -	fs->core_cycles = (uint64_t) (fs->core_cycles + core_cycles);
> -#endif
> +	TEST_PMD_CORE_CYC_FWD_ADD(fs, start_rx_tsc);
>  }
> 
>  struct fwd_engine io_fwd_engine = {
> diff --git a/app/test-pmd/macfwd.c b/app/test-pmd/macfwd.c index
> 7cac757..2fd38ea 100644
> --- a/app/test-pmd/macfwd.c
> +++ b/app/test-pmd/macfwd.c
> @@ -56,21 +56,23 @@
>  	uint16_t i;
>  	uint64_t ol_flags = 0;
>  	uint64_t tx_offloads;
> -#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
> -	uint64_t start_tsc;
> -	uint64_t end_tsc;
> -	uint64_t core_cycles;
> +
> +#if defined(RTE_TEST_PMD_RECORD_CORE_TX_CYCLES)

Should the RTE_TEST_PMD_RECORD_CORE_CYCLES macro be checked here too?

> +	uint64_t start_tx_tsc;
>  #endif
> +#if defined(RTE_TEST_PMD_RECORD_CORE_CYCLES) || \
> +	defined(RTE_TEST_PMD_RECORD_CORE_RX_CYCLES)
> +	uint64_t start_rx_tsc;
> 
> -#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
> -	start_tsc = rte_rdtsc();
>  #endif
> 
>  	/*
>  	 * Receive a burst of packets and forward them.
>  	 */
> +	TEST_PMD_CORE_CYC_RX_START(start_rx_tsc);
>  	nb_rx = rte_eth_rx_burst(fs->rx_port, fs->rx_queue, pkts_burst,
>  				 nb_pkt_per_burst);
> +	TEST_PMD_CORE_CYC_RX_ADD(fs, start_rx_tsc);
>  	if (unlikely(nb_rx == 0))
>  		return;
> 
> @@ -103,7 +105,9 @@
>  		mb->vlan_tci = txp->tx_vlan_id;
>  		mb->vlan_tci_outer = txp->tx_vlan_id_outer;
>  	}
> +	TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
>  	nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue, pkts_burst, nb_rx);
> +	TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
>  	/*
>  	 * Retry if necessary
>  	 */
> @@ -111,8 +115,10 @@
>  		retry = 0;
>  		while (nb_tx < nb_rx && retry++ < burst_tx_retry_num) {
>  			rte_delay_us(burst_tx_delay_time);
> +			TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
>  			nb_tx += rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
>  					&pkts_burst[nb_tx], nb_rx - nb_tx);
> +			TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
>  		}
>  	}
> 
> @@ -126,11 +132,7 @@
>  			rte_pktmbuf_free(pkts_burst[nb_tx]);
>  		} while (++nb_tx < nb_rx);
>  	}
> -#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
> -	end_tsc = rte_rdtsc();
> -	core_cycles = (end_tsc - start_tsc);
> -	fs->core_cycles = (uint64_t) (fs->core_cycles + core_cycles);
> -#endif
> +	TEST_PMD_CORE_CYC_FWD_ADD(fs, start_rx_tsc);
>  }
> 
>  struct fwd_engine mac_fwd_engine = {
> diff --git a/app/test-pmd/macswap.c b/app/test-pmd/macswap.c index
> 71af916..b22acdb 100644
> --- a/app/test-pmd/macswap.c
> +++ b/app/test-pmd/macswap.c
> @@ -86,21 +86,22 @@
>  	uint16_t nb_rx;
>  	uint16_t nb_tx;
>  	uint32_t retry;
> -#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
> -	uint64_t start_tsc;
> -	uint64_t end_tsc;
> -	uint64_t core_cycles;
> -#endif
> 
> -#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
> -	start_tsc = rte_rdtsc();
> +#if defined(RTE_TEST_PMD_RECORD_CORE_TX_CYCLES)

Should the RTE_TEST_PMD_RECORD_CORE_CYCLES macro be checked here too?

> +	uint64_t start_tx_tsc;
> +#endif
> +#if defined(RTE_TEST_PMD_RECORD_CORE_CYCLES) || \
> +	defined(RTE_TEST_PMD_RECORD_CORE_RX_CYCLES)
> +	uint64_t start_rx_tsc;
>  #endif
> 
>  	/*
>  	 * Receive a burst of packets and forward them.
>  	 */
> +	TEST_PMD_CORE_CYC_RX_START(start_rx_tsc);
>  	nb_rx = rte_eth_rx_burst(fs->rx_port, fs->rx_queue, pkts_burst,
>  				 nb_pkt_per_burst);
> +	TEST_PMD_CORE_CYC_RX_ADD(fs, start_rx_tsc);
>  	if (unlikely(nb_rx == 0))
>  		return;
> 
> @@ -112,7 +113,10 @@
> 
>  	do_macswap(pkts_burst, nb_rx, txp);
> 
> +	TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
>  	nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue, pkts_burst, nb_rx);
> +	TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
> +
>  	/*
>  	 * Retry if necessary
>  	 */
> @@ -120,8 +124,10 @@
>  		retry = 0;
>  		while (nb_tx < nb_rx && retry++ < burst_tx_retry_num) {
>  			rte_delay_us(burst_tx_delay_time);
> +			TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
>  			nb_tx += rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
>  					&pkts_burst[nb_tx], nb_rx - nb_tx);
> +			TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
>  		}
>  	}
>  	fs->tx_packets += nb_tx;
> @@ -134,11 +140,7 @@
>  			rte_pktmbuf_free(pkts_burst[nb_tx]);
>  		} while (++nb_tx < nb_rx);
>  	}
> -#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
> -	end_tsc = rte_rdtsc();
> -	core_cycles = (end_tsc - start_tsc);
> -	fs->core_cycles = (uint64_t) (fs->core_cycles + core_cycles);
> -#endif
> +	TEST_PMD_CORE_CYC_FWD_ADD(fs, start_rx_tsc);
>  }
> 
>  struct fwd_engine mac_swap_engine = {
> diff --git a/app/test-pmd/rxonly.c b/app/test-pmd/rxonly.c index
> 5c65fc4..d1da357 100644
> --- a/app/test-pmd/rxonly.c
> +++ b/app/test-pmd/rxonly.c
> @@ -50,19 +50,18 @@
>  	uint16_t nb_rx;
>  	uint16_t i;
> 
> -#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
> -	uint64_t start_tsc;
> -	uint64_t end_tsc;
> -	uint64_t core_cycles;
> -
> -	start_tsc = rte_rdtsc();
> +#if defined(RTE_TEST_PMD_RECORD_CORE_CYCLES) || \
> +	defined(RTE_TEST_PMD_RECORD_CORE_RX_CYCLES)
> +	uint64_t start_rx_tsc;
>  #endif
> 
>  	/*
>  	 * Receive a burst of packets.
>  	 */
> +	TEST_PMD_CORE_CYC_RX_START(start_rx_tsc);
>  	nb_rx = rte_eth_rx_burst(fs->rx_port, fs->rx_queue, pkts_burst,
>  				 nb_pkt_per_burst);
> +	TEST_PMD_CORE_CYC_RX_ADD(fs, start_rx_tsc);
>  	if (unlikely(nb_rx == 0))
>  		return;
> 
> @@ -73,11 +72,7 @@
>  	for (i = 0; i < nb_rx; i++)
>  		rte_pktmbuf_free(pkts_burst[i]);
> 
> -#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
> -	end_tsc = rte_rdtsc();
> -	core_cycles = (end_tsc - start_tsc);
> -	fs->core_cycles = (uint64_t) (fs->core_cycles + core_cycles);
> -#endif
> +	TEST_PMD_CORE_CYC_FWD_ADD(fs, start_rx_tsc);
>  }
> 
>  struct fwd_engine rx_only_engine = {
> diff --git a/app/test-pmd/softnicfwd.c b/app/test-pmd/softnicfwd.c index
> 94e6669..9b2b0e6 100644
> --- a/app/test-pmd/softnicfwd.c
> +++ b/app/test-pmd/softnicfwd.c
> @@ -87,35 +87,39 @@ struct tm_hierarchy {
>  	uint16_t nb_tx;
>  	uint32_t retry;
> 
> -#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
> -	uint64_t start_tsc;
> -	uint64_t end_tsc;
> -	uint64_t core_cycles;
> +#if defined(RTE_TEST_PMD_RECORD_CORE_TX_CYCLES)

Should the RTE_TEST_PMD_RECORD_CORE_CYCLES macro be checked here too?

> +	uint64_t start_tx_tsc;
>  #endif
> -
> -#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
> -	start_tsc = rte_rdtsc();
> +#if defined(RTE_TEST_PMD_RECORD_CORE_CYCLES) || \
> +	defined(RTE_TEST_PMD_RECORD_CORE_RX_CYCLES)
> +	uint64_t start_rx_tsc;
>  #endif
> 
>  	/*  Packets Receive */
> +	TEST_PMD_CORE_CYC_RX_START(start_rx_tsc);
>  	nb_rx = rte_eth_rx_burst(fs->rx_port, fs->rx_queue,
>  			pkts_burst, nb_pkt_per_burst);
> +	TEST_PMD_CORE_CYC_RX_ADD(fs, start_rx_tsc);
>  	fs->rx_packets += nb_rx;
> 
>  #ifdef RTE_TEST_PMD_RECORD_BURST_STATS
>  	fs->rx_burst_stats.pkt_burst_spread[nb_rx]++;
>  #endif
> 
> +	TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
>  	nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
>  			pkts_burst, nb_rx);
> +	TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
> 
>  	/* Retry if necessary */
>  	if (unlikely(nb_tx < nb_rx) && fs->retry_enabled) {
>  		retry = 0;
>  		while (nb_tx < nb_rx && retry++ < burst_tx_retry_num) {
>  			rte_delay_us(burst_tx_delay_time);
> +			TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
>  			nb_tx += rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
>  					&pkts_burst[nb_tx], nb_rx - nb_tx);
> +			TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
>  		}
>  	}
>  	fs->tx_packets += nb_tx;
> @@ -130,11 +134,7 @@ struct tm_hierarchy {
>  			rte_pktmbuf_free(pkts_burst[nb_tx]);
>  		} while (++nb_tx < nb_rx);
>  	}
> -#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
> -	end_tsc = rte_rdtsc();
> -	core_cycles = (end_tsc - start_tsc);
> -	fs->core_cycles = (uint64_t) (fs->core_cycles + core_cycles);
> -#endif
> +	TEST_PMD_CORE_CYC_FWD_ADD(fs, start_rx_tsc);
>  }
> 
>  static void
> diff --git a/app/test-pmd/testpmd.c b/app/test-pmd/testpmd.c index
> f0061d9..de8478f 100644
> --- a/app/test-pmd/testpmd.c
> +++ b/app/test-pmd/testpmd.c
> @@ -1483,6 +1483,12 @@ struct extmem_param {  #ifdef
> RTE_TEST_PMD_RECORD_CORE_CYCLES
>  	uint64_t fwd_cycles = 0;
>  #endif
> +#ifdef RTE_TEST_PMD_RECORD_CORE_RX_CYCLES
> +	uint64_t rx_cycles = 0;
> +#endif
> +#ifdef RTE_TEST_PMD_RECORD_CORE_TX_CYCLES
> +	uint64_t tx_cycles = 0;
> +#endif
>  	uint64_t total_recv = 0;
>  	uint64_t total_xmit = 0;
>  	struct rte_port *port;
> @@ -1513,6 +1519,12 @@ struct extmem_param {  #ifdef
> RTE_TEST_PMD_RECORD_CORE_CYCLES
>  		fwd_cycles += fs->core_cycles;
>  #endif
> +#ifdef RTE_TEST_PMD_RECORD_CORE_RX_CYCLES
> +		rx_cycles += fs->core_rx_cycles;
> +#endif
> +#ifdef RTE_TEST_PMD_RECORD_CORE_TX_CYCLES
> +		tx_cycles += fs->core_tx_cycles;
> +#endif
>  	}
>  	for (i = 0; i < cur_fwd_config.nb_fwd_ports; i++) {
>  		uint8_t j;
> @@ -1648,6 +1660,20 @@ struct extmem_param {
>  		       (unsigned int)(fwd_cycles / total_recv),
>  		       fwd_cycles, total_recv);
>  #endif
> +#ifdef RTE_TEST_PMD_RECORD_CORE_RX_CYCLES
> +	if (total_recv > 0)
> +		printf("\n  rx CPU cycles/packet=%u (total cycles="
> +		       "%"PRIu64" / total RX packets=%"PRIu64")\n",
> +		       (unsigned int)(rx_cycles / total_recv),
> +		       rx_cycles, total_recv);
> +#endif
> +#ifdef RTE_TEST_PMD_RECORD_CORE_TX_CYCLES
> +	if (total_xmit > 0)
> +		printf("\n  tx CPU cycles/packet=%u (total cycles="
> +		       "%"PRIu64" / total TX packets=%"PRIu64")\n",
> +		       (unsigned int)(tx_cycles / total_xmit),
> +		       tx_cycles, total_xmit);
> +#endif
>  }
> 
>  void
> @@ -1678,6 +1704,12 @@ struct extmem_param {  #ifdef
> RTE_TEST_PMD_RECORD_CORE_CYCLES
>  		fs->core_cycles = 0;
>  #endif
> +#ifdef RTE_TEST_PMD_RECORD_CORE_RX_CYCLES
> +		fs->core_rx_cycles = 0;
> +#endif
> +#ifdef RTE_TEST_PMD_RECORD_CORE_TX_CYCLES
> +		fs->core_tx_cycles = 0;
> +#endif
>  	}
>  }
> 
> diff --git a/app/test-pmd/testpmd.h b/app/test-pmd/testpmd.h index
> 1d9b7a2..4e8af8a 100644
> --- a/app/test-pmd/testpmd.h
> +++ b/app/test-pmd/testpmd.h
> @@ -130,12 +130,52 @@ struct fwd_stream {  #ifdef
> RTE_TEST_PMD_RECORD_CORE_CYCLES
>  	uint64_t     core_cycles; /**< used for RX and TX processing */
>  #endif
> +#ifdef RTE_TEST_PMD_RECORD_CORE_TX_CYCLES
> +	uint64_t     core_tx_cycles; /**< used for tx_burst processing */
> +#endif
> +#ifdef RTE_TEST_PMD_RECORD_CORE_RX_CYCLES
> +	uint64_t     core_rx_cycles; /**< used for rx_burst processing */
> +#endif
>  #ifdef RTE_TEST_PMD_RECORD_BURST_STATS
>  	struct pkt_burst_stats rx_burst_stats;
>  	struct pkt_burst_stats tx_burst_stats;  #endif  };
> 
> +#if defined(RTE_TEST_PMD_RECORD_CORE_TX_CYCLES)
> +#define TEST_PMD_CORE_CYC_TX_START(a) {a = rte_rdtsc(); } #else #define
> +TEST_PMD_CORE_CYC_TX_START(a) #endif
> +
> +#if defined(RTE_TEST_PMD_RECORD_CORE_CYCLES) || \
> +	defined(RTE_TEST_PMD_RECORD_CORE_RX_CYCLES)
> +#define TEST_PMD_CORE_CYC_RX_START(a) {a = rte_rdtsc(); } #else #define
> +TEST_PMD_CORE_CYC_RX_START(a) #endif
> +
> +#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
> +#define TEST_PMD_CORE_CYC_FWD_ADD(fs, s) \ {uint64_t end_tsc =
> +rte_rdtsc(); fs->core_cycles += end_tsc - (s); } #else #define
> +TEST_PMD_CORE_CYC_FWD_ADD(fs, s) #endif
> +
> +#ifdef RTE_TEST_PMD_RECORD_CORE_TX_CYCLES
> +#define TEST_PMD_CORE_CYC_TX_ADD(fs, s) \ {uint64_t end_tsc =
> +rte_rdtsc(); fs->core_tx_cycles += end_tsc - (s); } #else #define
> +TEST_PMD_CORE_CYC_TX_ADD(fs, s) #endif
> +
> +#ifdef RTE_TEST_PMD_RECORD_CORE_RX_CYCLES
> +#define TEST_PMD_CORE_CYC_RX_ADD(fs, s) \ {uint64_t end_tsc =
> +rte_rdtsc(); fs->core_rx_cycles += end_tsc - (s); } #else #define
> +TEST_PMD_CORE_CYC_RX_ADD(fs, s) #endif
> +
>  /** Descriptor for a single flow. */
>  struct port_flow {
>  	struct port_flow *next; /**< Next flow in list. */ diff --git a/app/test-
> pmd/txonly.c b/app/test-pmd/txonly.c index fdfca14..fe3045a 100644
> --- a/app/test-pmd/txonly.c
> +++ b/app/test-pmd/txonly.c
> @@ -241,16 +241,16 @@
>  	uint32_t retry;
>  	uint64_t ol_flags = 0;
>  	uint64_t tx_offloads;
> -#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
> -	uint64_t start_tsc;
> -	uint64_t end_tsc;
> -	uint64_t core_cycles;
> +#if defined(RTE_TEST_PMD_RECORD_CORE_TX_CYCLES)
> +	uint64_t start_tx_tsc;
> +#endif
> +#if defined(RTE_TEST_PMD_RECORD_CORE_CYCLES)
> +	uint64_t start_rx_tsc;
>  #endif
> 
>  #ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
> -	start_tsc = rte_rdtsc();
> +	TEST_PMD_CORE_CYC_RX_START(start_rx_tsc);
>  #endif
> -
>  	mbp = current_fwd_lcore()->mbp;
>  	txp = &ports[fs->tx_port];
>  	tx_offloads = txp->dev_conf.txmode.offloads; @@ -302,7 +302,9 @@
>  	if (nb_pkt == 0)
>  		return;
> 
> +	TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
>  	nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue, pkts_burst, nb_pkt);
> +	TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
>  	/*
>  	 * Retry if necessary
>  	 */
> @@ -310,8 +312,10 @@
>  		retry = 0;
>  		while (nb_tx < nb_pkt && retry++ < burst_tx_retry_num) {
>  			rte_delay_us(burst_tx_delay_time);
> +			TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
>  			nb_tx += rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
>  					&pkts_burst[nb_tx], nb_pkt - nb_tx);
> +			TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
>  		}
>  	}
>  	fs->tx_packets += nb_tx;
> @@ -334,12 +338,7 @@
>  			rte_pktmbuf_free(pkts_burst[nb_tx]);
>  		} while (++nb_tx < nb_pkt);
>  	}
> -
> -#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
> -	end_tsc = rte_rdtsc();
> -	core_cycles = (end_tsc - start_tsc);
> -	fs->core_cycles = (uint64_t) (fs->core_cycles + core_cycles);
> -#endif
> +	TEST_PMD_CORE_CYC_FWD_ADD(fs, start_rx_tsc);
>  }
> 
>  static void
> diff --git a/config/common_base b/config/common_base index
> 6b96e0e..6e84af4 100644
> --- a/config/common_base
> +++ b/config/common_base
> @@ -998,6 +998,8 @@ CONFIG_RTE_PROC_INFO=n  #
> CONFIG_RTE_TEST_PMD=y
> CONFIG_RTE_TEST_PMD_RECORD_CORE_CYCLES=n
> +CONFIG_RTE_TEST_PMD_RECORD_CORE_RX_CYCLES=n
> +CONFIG_RTE_TEST_PMD_RECORD_CORE_TX_CYCLES=n
>  CONFIG_RTE_TEST_PMD_RECORD_BURST_STATS=n

Should the RECORD macros be documented in the run_app.rst file ?
 
>  #
> --
> 1.8.3.1

Regards,

Bernard
  
Slava Ovsiienko June 10, 2019, 4:39 a.m. UTC | #2
Hi, Bernard

Thanks for the comment.


> > -#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
> > -	uint64_t start_tsc;
> > -	uint64_t end_tsc;
> > -	uint64_t core_cycles;
> > +#if defined(RTE_TEST_PMD_RECORD_CORE_TX_CYCLES)
> > +	uint64_t start_tx_tsc;
> 
> Should the RTE_TEST_PMD_RECORD_CORE_CYCLES macro be checked here
> too?
>
I think it should not. All of the options:
RTE_TEST_PMD_RECORD_CORE_CYCLES
RTE_TEST_PMD_RECORD_CORE_TX_CYCLES
RTE_TEST_PMD_RECORD_CORE_RX_CYCLES
are supposed to be defined independently.
I've compiled for all 8 possible CORE_xx_CYCLES combinations.

RTE_TEST_PMD_RECORD_CORE_TX_CYCLES uses the dedicated TSC start point "start_tx_tsc".
RTE_TEST_PMD_RECORD_CORE_CYCLES and RTE_TEST_PMD_RECORD_CORE_RX_CYCLES
share the "start_rx_tsc".

With best regards,
Slava (Viacheslav)

> -----Original Message-----
> From: Iremonger, Bernard <bernard.iremonger@intel.com>
> Sent: Friday, June 7, 2019 19:08
> To: Slava Ovsiienko <viacheslavo@mellanox.com>; dev@dpdk.org
> Cc: Yigit, Ferruh <ferruh.yigit@intel.com>
> Subject: RE: [dpdk-dev] [RFC] app/testpmd: add profiling for Rx/Tx burst
> routines
> 
> Hi Viacheslav,
> 
> 
> > -----Original Message-----
> > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Viacheslav
> > Ovsiienko
> > Sent: Monday, May 27, 2019 6:47 AM
> > To: dev@dpdk.org
> > Cc: Yigit, Ferruh <ferruh.yigit@intel.com>
> > Subject: [dpdk-dev] [RFC] app/testpmd: add profiling for Rx/Tx burst
> > routines
> >
> > There is the testpmd configuration option called
> > RTE_TEST_PMD_RECORD_CORE_CYCLES, if this one is turned on the
> testpmd
> > application measures the CPU clocks spent within forwarding loop. This
> > time is the sum of execution times of rte_eth_rx_burst(),
> > rte_eth_tx_burst(), rte_delay_us(),
> >  rte_pktmbuf_free() and so on, depending on fwd mode set.
> >
> > While debugging and performance optimization of datapath burst
> > routines tt would be useful to see the pure execution times of these
> > ones. It is proposed to add separated profiling
> > options:
> >
> > CONFIG_RTE_TEST_PMD_RECORD_CORE_TX_CYCLES
> >     enables gathering profiling data for transmit datapath,
> >     ticks spent within rte_eth_tx_burst()
> >
> > CONFIG_RTE_TEST_PMD_RECORD_CORE_RX_CYCLES
> >     enables gathering profiling data for transmit datapath,
> >     ticks spent within rte_eth_rx_burst()
> >
> > Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
> > ---
> >  app/test-pmd/csumonly.c   | 25 ++++++++++++-------------
> >  app/test-pmd/flowgen.c    | 25 +++++++++++++------------
> >  app/test-pmd/icmpecho.c   | 26 +++++++++++++-------------
> >  app/test-pmd/iofwd.c      | 24 ++++++++++++------------
> >  app/test-pmd/macfwd.c     | 24 +++++++++++++-----------
> >  app/test-pmd/macswap.c    | 26 ++++++++++++++------------
> >  app/test-pmd/rxonly.c     | 17 ++++++-----------
> >  app/test-pmd/softnicfwd.c | 24 ++++++++++++------------
> >  app/test-pmd/testpmd.c    | 32 ++++++++++++++++++++++++++++++++
> >  app/test-pmd/testpmd.h    | 40
> > ++++++++++++++++++++++++++++++++++++++++
> >  app/test-pmd/txonly.c     | 23 +++++++++++------------
> >  config/common_base        |  2 ++
> >  12 files changed, 180 insertions(+), 108 deletions(-)
> >
> > diff --git a/app/test-pmd/csumonly.c b/app/test-pmd/csumonly.c index
> > f4f2a7b..251e179 100644
> > --- a/app/test-pmd/csumonly.c
> > +++ b/app/test-pmd/csumonly.c
> > @@ -710,19 +710,19 @@ struct simple_gre_hdr {
> >  	uint16_t nb_segments = 0;
> >  	int ret;
> >
> > -#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
> > -	uint64_t start_tsc;
> > -	uint64_t end_tsc;
> > -	uint64_t core_cycles;
> > +#if defined(RTE_TEST_PMD_RECORD_CORE_TX_CYCLES)
> > +	uint64_t start_tx_tsc;
> 
> Should the RTE_TEST_PMD_RECORD_CORE_CYCLES macro be checked here
> too?
> 
> >  #endif
> > -
> > -#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
> > -	start_tsc = rte_rdtsc();
> > +#if defined(RTE_TEST_PMD_RECORD_CORE_CYCLES) || \
> > +	defined(RTE_TEST_PMD_RECORD_CORE_RX_CYCLES)
> > +	uint64_t start_rx_tsc;
> >  #endif
> >
> >  	/* receive a burst of packet */
> > +	TEST_PMD_CORE_CYC_RX_START(start_rx_tsc);
> >  	nb_rx = rte_eth_rx_burst(fs->rx_port, fs->rx_queue, pkts_burst,
> >  				 nb_pkt_per_burst);
> > +	TEST_PMD_CORE_CYC_RX_ADD(fs, start_rx_tsc);
> >  	if (unlikely(nb_rx == 0))
> >  		return;
> >  #ifdef RTE_TEST_PMD_RECORD_BURST_STATS @@ -982,8 +982,10 @@
> struct
> > simple_gre_hdr {
> >  		printf("Preparing packet burst to transmit failed: %s\n",
> >  				rte_strerror(rte_errno));
> >
> > +	TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
> >  	nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue, tx_pkts_burst,
> >  			nb_prep);
> > +	TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
> >
> >  	/*
> >  	 * Retry if necessary
> > @@ -992,8 +994,10 @@ struct simple_gre_hdr {
> >  		retry = 0;
> >  		while (nb_tx < nb_rx && retry++ < burst_tx_retry_num) {
> >  			rte_delay_us(burst_tx_delay_time);
> > +			TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
> >  			nb_tx += rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
> >  					&tx_pkts_burst[nb_tx], nb_rx -
> nb_tx);
> > +			TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
> >  		}
> >  	}
> >  	fs->tx_packets += nb_tx;
> > @@ -1010,12 +1014,7 @@ struct simple_gre_hdr {
> >  			rte_pktmbuf_free(tx_pkts_burst[nb_tx]);
> >  		} while (++nb_tx < nb_rx);
> >  	}
> > -
> > -#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
> > -	end_tsc = rte_rdtsc();
> > -	core_cycles = (end_tsc - start_tsc);
> > -	fs->core_cycles = (uint64_t) (fs->core_cycles + core_cycles);
> > -#endif
> > +	TEST_PMD_CORE_CYC_FWD_ADD(fs, start_rx_tsc);
> >  }
> >
> >  struct fwd_engine csum_fwd_engine = { diff --git
> > a/app/test-pmd/flowgen.c b/app/test-pmd/flowgen.c index
> > 3214e3c..b128e68 100644
> > --- a/app/test-pmd/flowgen.c
> > +++ b/app/test-pmd/flowgen.c
> > @@ -130,20 +130,21 @@
> >  	uint16_t i;
> >  	uint32_t retry;
> >  	uint64_t tx_offloads;
> > -#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
> > -	uint64_t start_tsc;
> > -	uint64_t end_tsc;
> > -	uint64_t core_cycles;
> > -#endif
> >  	static int next_flow = 0;
> >
> > -#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
> > -	start_tsc = rte_rdtsc();
> > +#if defined(RTE_TEST_PMD_RECORD_CORE_TX_CYCLES)
> 
> Should the RTE_TEST_PMD_RECORD_CORE_CYCLES macro be checked here
> too?
> 
> > +	uint64_t start_tx_tsc;
> > +#endif
> > +#if defined(RTE_TEST_PMD_RECORD_CORE_CYCLES) || \
> > +	defined(RTE_TEST_PMD_RECORD_CORE_RX_CYCLES)
> > +	uint64_t start_rx_tsc;
> >  #endif
> >
> >  	/* Receive a burst of packets and discard them. */
> > +	TEST_PMD_CORE_CYC_RX_START(start_rx_tsc);
> >  	nb_rx = rte_eth_rx_burst(fs->rx_port, fs->rx_queue, pkts_burst,
> >  				 nb_pkt_per_burst);
> > +	TEST_PMD_CORE_CYC_RX_ADD(fs, start_rx_tsc);
> >  	fs->rx_packets += nb_rx;
> >
> >  	for (i = 0; i < nb_rx; i++)
> > @@ -212,7 +213,9 @@
> >  		next_flow = (next_flow + 1) % cfg_n_flows;
> >  	}
> >
> > +	TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
> >  	nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue, pkts_burst,
> > nb_pkt);
> > +	TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
> >  	/*
> >  	 * Retry if necessary
> >  	 */
> > @@ -220,8 +223,10 @@
> >  		retry = 0;
> >  		while (nb_tx < nb_rx && retry++ < burst_tx_retry_num) {
> >  			rte_delay_us(burst_tx_delay_time);
> > +			TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
> >  			nb_tx += rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
> >  					&pkts_burst[nb_tx], nb_rx - nb_tx);
> > +			TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
> >  		}
> >  	}
> >  	fs->tx_packets += nb_tx;
> > @@ -239,11 +244,7 @@
> >  			rte_pktmbuf_free(pkts_burst[nb_tx]);
> >  		} while (++nb_tx < nb_pkt);
> >  	}
> > -#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
> > -	end_tsc = rte_rdtsc();
> > -	core_cycles = (end_tsc - start_tsc);
> > -	fs->core_cycles = (uint64_t) (fs->core_cycles + core_cycles);
> > -#endif
> > +	TEST_PMD_CORE_CYC_FWD_ADD(fs, start_rx_tsc);
> >  }
> >
> >  struct fwd_engine flow_gen_engine = { diff --git
> > a/app/test-pmd/icmpecho.c b/app/test-pmd/icmpecho.c index
> > 55d266d..a539fe8 100644
> > --- a/app/test-pmd/icmpecho.c
> > +++ b/app/test-pmd/icmpecho.c
> > @@ -293,21 +293,22 @@
> >  	uint32_t cksum;
> >  	uint8_t  i;
> >  	int l2_len;
> > -#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
> > -	uint64_t start_tsc;
> > -	uint64_t end_tsc;
> > -	uint64_t core_cycles;
> > -#endif
> >
> > -#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
> > -	start_tsc = rte_rdtsc();
> > +#if defined(RTE_TEST_PMD_RECORD_CORE_TX_CYCLES)
> > +	uint64_t start_tx_tsc;
> > +#endif
> > +#if defined(RTE_TEST_PMD_RECORD_CORE_CYCLES) || \
> > +	defined(RTE_TEST_PMD_RECORD_CORE_RX_CYCLES)
> > +	uint64_t start_rx_tsc;
> >  #endif
> >
> >  	/*
> >  	 * First, receive a burst of packets.
> >  	 */
> > +	TEST_PMD_CORE_CYC_RX_START(start_rx_tsc);
> >  	nb_rx = rte_eth_rx_burst(fs->rx_port, fs->rx_queue, pkts_burst,
> >  				 nb_pkt_per_burst);
> > +	TEST_PMD_CORE_CYC_RX_ADD(fs, start_rx_tsc);
> >  	if (unlikely(nb_rx == 0))
> >  		return;
> >
> > @@ -487,8 +488,10 @@
> >
> >  	/* Send back ICMP echo replies, if any. */
> >  	if (nb_replies > 0) {
> > +		TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
> >  		nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
> pkts_burst,
> >  					 nb_replies);
> > +		TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
> >  		/*
> >  		 * Retry if necessary
> >  		 */
> > @@ -497,10 +500,12 @@
> >  			while (nb_tx < nb_replies &&
> >  					retry++ < burst_tx_retry_num) {
> >  				rte_delay_us(burst_tx_delay_time);
> > +				TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
> >  				nb_tx += rte_eth_tx_burst(fs->tx_port,
> >  						fs->tx_queue,
> >  						&pkts_burst[nb_tx],
> >  						nb_replies - nb_tx);
> > +				TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
> >  			}
> >  		}
> >  		fs->tx_packets += nb_tx;
> > @@ -514,12 +519,7 @@
> >  			} while (++nb_tx < nb_replies);
> >  		}
> >  	}
> > -
> > -#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
> > -	end_tsc = rte_rdtsc();
> > -	core_cycles = (end_tsc - start_tsc);
> > -	fs->core_cycles = (uint64_t) (fs->core_cycles + core_cycles);
> > -#endif
> > +	TEST_PMD_CORE_CYC_FWD_ADD(fs, start_rx_tsc);
> >  }
> >
> >  struct fwd_engine icmp_echo_engine = { diff --git
> > a/app/test-pmd/iofwd.c b/app/test-pmd/iofwd.c index
> > 9dce76e..dc66a88 100644
> > --- a/app/test-pmd/iofwd.c
> > +++ b/app/test-pmd/iofwd.c
> > @@ -51,21 +51,21 @@
> >  	uint16_t nb_tx;
> >  	uint32_t retry;
> >
> > -#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
> > -	uint64_t start_tsc;
> > -	uint64_t end_tsc;
> > -	uint64_t core_cycles;
> > +#if defined(RTE_TEST_PMD_RECORD_CORE_TX_CYCLES)
> 
> Should the RTE_TEST_PMD_RECORD_CORE_CYCLES macro be checked here
> too?
> 
> > +	uint64_t start_tx_tsc;
> >  #endif
> > -
> > -#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
> > -	start_tsc = rte_rdtsc();
> > +#if defined(RTE_TEST_PMD_RECORD_CORE_CYCLES) || \
> > +	defined(RTE_TEST_PMD_RECORD_CORE_RX_CYCLES)
> > +	uint64_t start_rx_tsc;
> >  #endif
> >
> >  	/*
> >  	 * Receive a burst of packets and forward them.
> >  	 */
> > +	TEST_PMD_CORE_CYC_RX_START(start_rx_tsc);
> >  	nb_rx = rte_eth_rx_burst(fs->rx_port, fs->rx_queue,
> >  			pkts_burst, nb_pkt_per_burst);
> > +	TEST_PMD_CORE_CYC_RX_ADD(fs, start_rx_tsc);
> >  	if (unlikely(nb_rx == 0))
> >  		return;
> >  	fs->rx_packets += nb_rx;
> > @@ -73,8 +73,10 @@
> >  #ifdef RTE_TEST_PMD_RECORD_BURST_STATS
> >  	fs->rx_burst_stats.pkt_burst_spread[nb_rx]++;
> >  #endif
> > +	TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
> >  	nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
> >  			pkts_burst, nb_rx);
> > +	TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
> >  	/*
> >  	 * Retry if necessary
> >  	 */
> > @@ -82,8 +84,10 @@
> >  		retry = 0;
> >  		while (nb_tx < nb_rx && retry++ < burst_tx_retry_num) {
> >  			rte_delay_us(burst_tx_delay_time);
> > +			TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
> >  			nb_tx += rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
> >  					&pkts_burst[nb_tx], nb_rx - nb_tx);
> > +			TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
> >  		}
> >  	}
> >  	fs->tx_packets += nb_tx;
> > @@ -96,11 +100,7 @@
> >  			rte_pktmbuf_free(pkts_burst[nb_tx]);
> >  		} while (++nb_tx < nb_rx);
> >  	}
> > -#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
> > -	end_tsc = rte_rdtsc();
> > -	core_cycles = (end_tsc - start_tsc);
> > -	fs->core_cycles = (uint64_t) (fs->core_cycles + core_cycles);
> > -#endif
> > +	TEST_PMD_CORE_CYC_FWD_ADD(fs, start_rx_tsc);
> >  }
> >
> >  struct fwd_engine io_fwd_engine = {
> > diff --git a/app/test-pmd/macfwd.c b/app/test-pmd/macfwd.c index
> > 7cac757..2fd38ea 100644
> > --- a/app/test-pmd/macfwd.c
> > +++ b/app/test-pmd/macfwd.c
> > @@ -56,21 +56,23 @@
> >  	uint16_t i;
> >  	uint64_t ol_flags = 0;
> >  	uint64_t tx_offloads;
> > -#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
> > -	uint64_t start_tsc;
> > -	uint64_t end_tsc;
> > -	uint64_t core_cycles;
> > +
> > +#if defined(RTE_TEST_PMD_RECORD_CORE_TX_CYCLES)
> 
> Should the RTE_TEST_PMD_RECORD_CORE_CYCLES macro be checked here
> too?
> 
> > +	uint64_t start_tx_tsc;
> >  #endif
> > +#if defined(RTE_TEST_PMD_RECORD_CORE_CYCLES) || \
> > +	defined(RTE_TEST_PMD_RECORD_CORE_RX_CYCLES)
> > +	uint64_t start_rx_tsc;
> >
> > -#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
> > -	start_tsc = rte_rdtsc();
> >  #endif
> >
> >  	/*
> >  	 * Receive a burst of packets and forward them.
> >  	 */
> > +	TEST_PMD_CORE_CYC_RX_START(start_rx_tsc);
> >  	nb_rx = rte_eth_rx_burst(fs->rx_port, fs->rx_queue, pkts_burst,
> >  				 nb_pkt_per_burst);
> > +	TEST_PMD_CORE_CYC_RX_ADD(fs, start_rx_tsc);
> >  	if (unlikely(nb_rx == 0))
> >  		return;
> >
> > @@ -103,7 +105,9 @@
> >  		mb->vlan_tci = txp->tx_vlan_id;
> >  		mb->vlan_tci_outer = txp->tx_vlan_id_outer;
> >  	}
> > +	TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
> >  	nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue, pkts_burst,
> > nb_rx);
> > +	TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
> >  	/*
> >  	 * Retry if necessary
> >  	 */
> > @@ -111,8 +115,10 @@
> >  		retry = 0;
> >  		while (nb_tx < nb_rx && retry++ < burst_tx_retry_num) {
> >  			rte_delay_us(burst_tx_delay_time);
> > +			TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
> >  			nb_tx += rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
> >  					&pkts_burst[nb_tx], nb_rx - nb_tx);
> > +			TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
> >  		}
> >  	}
> >
> > @@ -126,11 +132,7 @@
> >  			rte_pktmbuf_free(pkts_burst[nb_tx]);
> >  		} while (++nb_tx < nb_rx);
> >  	}
> > -#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
> > -	end_tsc = rte_rdtsc();
> > -	core_cycles = (end_tsc - start_tsc);
> > -	fs->core_cycles = (uint64_t) (fs->core_cycles + core_cycles);
> > -#endif
> > +	TEST_PMD_CORE_CYC_FWD_ADD(fs, start_rx_tsc);
> >  }
> >
> >  struct fwd_engine mac_fwd_engine = {
> > diff --git a/app/test-pmd/macswap.c b/app/test-pmd/macswap.c index
> > 71af916..b22acdb 100644
> > --- a/app/test-pmd/macswap.c
> > +++ b/app/test-pmd/macswap.c
> > @@ -86,21 +86,22 @@
> >  	uint16_t nb_rx;
> >  	uint16_t nb_tx;
> >  	uint32_t retry;
> > -#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
> > -	uint64_t start_tsc;
> > -	uint64_t end_tsc;
> > -	uint64_t core_cycles;
> > -#endif
> >
> > -#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
> > -	start_tsc = rte_rdtsc();
> > +#if defined(RTE_TEST_PMD_RECORD_CORE_TX_CYCLES)
> 
> Should the RTE_TEST_PMD_RECORD_CORE_CYCLES macro be checked here
> too?
> 
> > +	uint64_t start_tx_tsc;
> > +#endif
> > +#if defined(RTE_TEST_PMD_RECORD_CORE_CYCLES) || \
> > +	defined(RTE_TEST_PMD_RECORD_CORE_RX_CYCLES)
> > +	uint64_t start_rx_tsc;
> >  #endif
> >
> >  	/*
> >  	 * Receive a burst of packets and forward them.
> >  	 */
> > +	TEST_PMD_CORE_CYC_RX_START(start_rx_tsc);
> >  	nb_rx = rte_eth_rx_burst(fs->rx_port, fs->rx_queue, pkts_burst,
> >  				 nb_pkt_per_burst);
> > +	TEST_PMD_CORE_CYC_RX_ADD(fs, start_rx_tsc);
> >  	if (unlikely(nb_rx == 0))
> >  		return;
> >
> > @@ -112,7 +113,10 @@
> >
> >  	do_macswap(pkts_burst, nb_rx, txp);
> >
> > +	TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
> >  	nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue, pkts_burst,
> > nb_rx);
> > +	TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
> > +
> >  	/*
> >  	 * Retry if necessary
> >  	 */
> > @@ -120,8 +124,10 @@
> >  		retry = 0;
> >  		while (nb_tx < nb_rx && retry++ < burst_tx_retry_num) {
> >  			rte_delay_us(burst_tx_delay_time);
> > +			TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
> >  			nb_tx += rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
> >  					&pkts_burst[nb_tx], nb_rx - nb_tx);
> > +			TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
> >  		}
> >  	}
> >  	fs->tx_packets += nb_tx;
> > @@ -134,11 +140,7 @@
> >  			rte_pktmbuf_free(pkts_burst[nb_tx]);
> >  		} while (++nb_tx < nb_rx);
> >  	}
> > -#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
> > -	end_tsc = rte_rdtsc();
> > -	core_cycles = (end_tsc - start_tsc);
> > -	fs->core_cycles = (uint64_t) (fs->core_cycles + core_cycles);
> > -#endif
> > +	TEST_PMD_CORE_CYC_FWD_ADD(fs, start_rx_tsc);
> >  }
> >
> >  struct fwd_engine mac_swap_engine = { diff --git
> > a/app/test-pmd/rxonly.c b/app/test-pmd/rxonly.c index
> > 5c65fc4..d1da357 100644
> > --- a/app/test-pmd/rxonly.c
> > +++ b/app/test-pmd/rxonly.c
> > @@ -50,19 +50,18 @@
> >  	uint16_t nb_rx;
> >  	uint16_t i;
> >
> > -#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
> > -	uint64_t start_tsc;
> > -	uint64_t end_tsc;
> > -	uint64_t core_cycles;
> > -
> > -	start_tsc = rte_rdtsc();
> > +#if defined(RTE_TEST_PMD_RECORD_CORE_CYCLES) || \
> > +	defined(RTE_TEST_PMD_RECORD_CORE_RX_CYCLES)
> > +	uint64_t start_rx_tsc;
> >  #endif
> >
> >  	/*
> >  	 * Receive a burst of packets.
> >  	 */
> > +	TEST_PMD_CORE_CYC_RX_START(start_rx_tsc);
> >  	nb_rx = rte_eth_rx_burst(fs->rx_port, fs->rx_queue, pkts_burst,
> >  				 nb_pkt_per_burst);
> > +	TEST_PMD_CORE_CYC_RX_ADD(fs, start_rx_tsc);
> >  	if (unlikely(nb_rx == 0))
> >  		return;
> >
> > @@ -73,11 +72,7 @@
> >  	for (i = 0; i < nb_rx; i++)
> >  		rte_pktmbuf_free(pkts_burst[i]);
> >
> > -#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
> > -	end_tsc = rte_rdtsc();
> > -	core_cycles = (end_tsc - start_tsc);
> > -	fs->core_cycles = (uint64_t) (fs->core_cycles + core_cycles);
> > -#endif
> > +	TEST_PMD_CORE_CYC_FWD_ADD(fs, start_rx_tsc);
> >  }
> >
> >  struct fwd_engine rx_only_engine = {
> > diff --git a/app/test-pmd/softnicfwd.c b/app/test-pmd/softnicfwd.c
> > index
> > 94e6669..9b2b0e6 100644
> > --- a/app/test-pmd/softnicfwd.c
> > +++ b/app/test-pmd/softnicfwd.c
> > @@ -87,35 +87,39 @@ struct tm_hierarchy {
> >  	uint16_t nb_tx;
> >  	uint32_t retry;
> >
> > -#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
> > -	uint64_t start_tsc;
> > -	uint64_t end_tsc;
> > -	uint64_t core_cycles;
> > +#if defined(RTE_TEST_PMD_RECORD_CORE_TX_CYCLES)
> 
> Should the RTE_TEST_PMD_RECORD_CORE_CYCLES macro be checked here
> too?
> 
> > +	uint64_t start_tx_tsc;
> >  #endif
> > -
> > -#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
> > -	start_tsc = rte_rdtsc();
> > +#if defined(RTE_TEST_PMD_RECORD_CORE_CYCLES) || \
> > +	defined(RTE_TEST_PMD_RECORD_CORE_RX_CYCLES)
> > +	uint64_t start_rx_tsc;
> >  #endif
> >
> >  	/*  Packets Receive */
> > +	TEST_PMD_CORE_CYC_RX_START(start_rx_tsc);
> >  	nb_rx = rte_eth_rx_burst(fs->rx_port, fs->rx_queue,
> >  			pkts_burst, nb_pkt_per_burst);
> > +	TEST_PMD_CORE_CYC_RX_ADD(fs, start_rx_tsc);
> >  	fs->rx_packets += nb_rx;
> >
> >  #ifdef RTE_TEST_PMD_RECORD_BURST_STATS
> >  	fs->rx_burst_stats.pkt_burst_spread[nb_rx]++;
> >  #endif
> >
> > +	TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
> >  	nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
> >  			pkts_burst, nb_rx);
> > +	TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
> >
> >  	/* Retry if necessary */
> >  	if (unlikely(nb_tx < nb_rx) && fs->retry_enabled) {
> >  		retry = 0;
> >  		while (nb_tx < nb_rx && retry++ < burst_tx_retry_num) {
> >  			rte_delay_us(burst_tx_delay_time);
> > +			TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
> >  			nb_tx += rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
> >  					&pkts_burst[nb_tx], nb_rx - nb_tx);
> > +			TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
> >  		}
> >  	}
> >  	fs->tx_packets += nb_tx;
> > @@ -130,11 +134,7 @@ struct tm_hierarchy {
> >  			rte_pktmbuf_free(pkts_burst[nb_tx]);
> >  		} while (++nb_tx < nb_rx);
> >  	}
> > -#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
> > -	end_tsc = rte_rdtsc();
> > -	core_cycles = (end_tsc - start_tsc);
> > -	fs->core_cycles = (uint64_t) (fs->core_cycles + core_cycles);
> > -#endif
> > +	TEST_PMD_CORE_CYC_FWD_ADD(fs, start_rx_tsc);
> >  }
> >
> >  static void
> > diff --git a/app/test-pmd/testpmd.c b/app/test-pmd/testpmd.c index
> > f0061d9..de8478f 100644
> > --- a/app/test-pmd/testpmd.c
> > +++ b/app/test-pmd/testpmd.c
> > @@ -1483,6 +1483,12 @@ struct extmem_param {  #ifdef
> > RTE_TEST_PMD_RECORD_CORE_CYCLES
> >  	uint64_t fwd_cycles = 0;
> >  #endif
> > +#ifdef RTE_TEST_PMD_RECORD_CORE_RX_CYCLES
> > +	uint64_t rx_cycles = 0;
> > +#endif
> > +#ifdef RTE_TEST_PMD_RECORD_CORE_TX_CYCLES
> > +	uint64_t tx_cycles = 0;
> > +#endif
> >  	uint64_t total_recv = 0;
> >  	uint64_t total_xmit = 0;
> >  	struct rte_port *port;
> > @@ -1513,6 +1519,12 @@ struct extmem_param {  #ifdef
> > RTE_TEST_PMD_RECORD_CORE_CYCLES
> >  		fwd_cycles += fs->core_cycles;
> >  #endif
> > +#ifdef RTE_TEST_PMD_RECORD_CORE_RX_CYCLES
> > +		rx_cycles += fs->core_rx_cycles;
> > +#endif
> > +#ifdef RTE_TEST_PMD_RECORD_CORE_TX_CYCLES
> > +		tx_cycles += fs->core_tx_cycles;
> > +#endif
> >  	}
> >  	for (i = 0; i < cur_fwd_config.nb_fwd_ports; i++) {
> >  		uint8_t j;
> > @@ -1648,6 +1660,20 @@ struct extmem_param {
> >  		       (unsigned int)(fwd_cycles / total_recv),
> >  		       fwd_cycles, total_recv);
> >  #endif
> > +#ifdef RTE_TEST_PMD_RECORD_CORE_RX_CYCLES
> > +	if (total_recv > 0)
> > +		printf("\n  rx CPU cycles/packet=%u (total cycles="
> > +		       "%"PRIu64" / total RX packets=%"PRIu64")\n",
> > +		       (unsigned int)(rx_cycles / total_recv),
> > +		       rx_cycles, total_recv);
> > +#endif
> > +#ifdef RTE_TEST_PMD_RECORD_CORE_TX_CYCLES
> > +	if (total_xmit > 0)
> > +		printf("\n  tx CPU cycles/packet=%u (total cycles="
> > +		       "%"PRIu64" / total TX packets=%"PRIu64")\n",
> > +		       (unsigned int)(tx_cycles / total_xmit),
> > +		       tx_cycles, total_xmit);
> > +#endif
> >  }
> >
> >  void
> > @@ -1678,6 +1704,12 @@ struct extmem_param {  #ifdef
> > RTE_TEST_PMD_RECORD_CORE_CYCLES
> >  		fs->core_cycles = 0;
> >  #endif
> > +#ifdef RTE_TEST_PMD_RECORD_CORE_RX_CYCLES
> > +		fs->core_rx_cycles = 0;
> > +#endif
> > +#ifdef RTE_TEST_PMD_RECORD_CORE_TX_CYCLES
> > +		fs->core_tx_cycles = 0;
> > +#endif
> >  	}
> >  }
> >
> > diff --git a/app/test-pmd/testpmd.h b/app/test-pmd/testpmd.h index
> > 1d9b7a2..4e8af8a 100644
> > --- a/app/test-pmd/testpmd.h
> > +++ b/app/test-pmd/testpmd.h
> > @@ -130,12 +130,52 @@ struct fwd_stream {  #ifdef
> > RTE_TEST_PMD_RECORD_CORE_CYCLES
> >  	uint64_t     core_cycles; /**< used for RX and TX processing */
> >  #endif
> > +#ifdef RTE_TEST_PMD_RECORD_CORE_TX_CYCLES
> > +	uint64_t     core_tx_cycles; /**< used for tx_burst processing */
> > +#endif
> > +#ifdef RTE_TEST_PMD_RECORD_CORE_RX_CYCLES
> > +	uint64_t     core_rx_cycles; /**< used for rx_burst processing */
> > +#endif
> >  #ifdef RTE_TEST_PMD_RECORD_BURST_STATS
> >  	struct pkt_burst_stats rx_burst_stats;
> >  	struct pkt_burst_stats tx_burst_stats;  #endif  };
> >
> > +#if defined(RTE_TEST_PMD_RECORD_CORE_TX_CYCLES)
> > +#define TEST_PMD_CORE_CYC_TX_START(a) {a = rte_rdtsc(); } #else
> > +#define
> > +TEST_PMD_CORE_CYC_TX_START(a) #endif
> > +
> > +#if defined(RTE_TEST_PMD_RECORD_CORE_CYCLES) || \
> > +	defined(RTE_TEST_PMD_RECORD_CORE_RX_CYCLES)
> > +#define TEST_PMD_CORE_CYC_RX_START(a) {a = rte_rdtsc(); } #else
> > +#define
> > +TEST_PMD_CORE_CYC_RX_START(a) #endif
> > +
> > +#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES #define
> > +TEST_PMD_CORE_CYC_FWD_ADD(fs, s) \ {uint64_t end_tsc = rte_rdtsc();
> > +fs->core_cycles += end_tsc - (s); } #else #define
> > +TEST_PMD_CORE_CYC_FWD_ADD(fs, s) #endif
> > +
> > +#ifdef RTE_TEST_PMD_RECORD_CORE_TX_CYCLES
> > +#define TEST_PMD_CORE_CYC_TX_ADD(fs, s) \ {uint64_t end_tsc =
> > +rte_rdtsc(); fs->core_tx_cycles += end_tsc - (s); } #else #define
> > +TEST_PMD_CORE_CYC_TX_ADD(fs, s) #endif
> > +
> > +#ifdef RTE_TEST_PMD_RECORD_CORE_RX_CYCLES
> > +#define TEST_PMD_CORE_CYC_RX_ADD(fs, s) \ {uint64_t end_tsc =
> > +rte_rdtsc(); fs->core_rx_cycles += end_tsc - (s); } #else #define
> > +TEST_PMD_CORE_CYC_RX_ADD(fs, s) #endif
> > +
> >  /** Descriptor for a single flow. */
> >  struct port_flow {
> >  	struct port_flow *next; /**< Next flow in list. */ diff --git
> > a/app/test- pmd/txonly.c b/app/test-pmd/txonly.c index
> > fdfca14..fe3045a 100644
> > --- a/app/test-pmd/txonly.c
> > +++ b/app/test-pmd/txonly.c
> > @@ -241,16 +241,16 @@
> >  	uint32_t retry;
> >  	uint64_t ol_flags = 0;
> >  	uint64_t tx_offloads;
> > -#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
> > -	uint64_t start_tsc;
> > -	uint64_t end_tsc;
> > -	uint64_t core_cycles;
> > +#if defined(RTE_TEST_PMD_RECORD_CORE_TX_CYCLES)
> > +	uint64_t start_tx_tsc;
> > +#endif
> > +#if defined(RTE_TEST_PMD_RECORD_CORE_CYCLES)
> > +	uint64_t start_rx_tsc;
> >  #endif
> >
> >  #ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
> > -	start_tsc = rte_rdtsc();
> > +	TEST_PMD_CORE_CYC_RX_START(start_rx_tsc);
> >  #endif
> > -
> >  	mbp = current_fwd_lcore()->mbp;
> >  	txp = &ports[fs->tx_port];
> >  	tx_offloads = txp->dev_conf.txmode.offloads; @@ -302,7 +302,9
> @@
> >  	if (nb_pkt == 0)
> >  		return;
> >
> > +	TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
> >  	nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue, pkts_burst,
> > nb_pkt);
> > +	TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
> >  	/*
> >  	 * Retry if necessary
> >  	 */
> > @@ -310,8 +312,10 @@
> >  		retry = 0;
> >  		while (nb_tx < nb_pkt && retry++ < burst_tx_retry_num) {
> >  			rte_delay_us(burst_tx_delay_time);
> > +			TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
> >  			nb_tx += rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
> >  					&pkts_burst[nb_tx], nb_pkt - nb_tx);
> > +			TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
> >  		}
> >  	}
> >  	fs->tx_packets += nb_tx;
> > @@ -334,12 +338,7 @@
> >  			rte_pktmbuf_free(pkts_burst[nb_tx]);
> >  		} while (++nb_tx < nb_pkt);
> >  	}
> > -
> > -#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
> > -	end_tsc = rte_rdtsc();
> > -	core_cycles = (end_tsc - start_tsc);
> > -	fs->core_cycles = (uint64_t) (fs->core_cycles + core_cycles);
> > -#endif
> > +	TEST_PMD_CORE_CYC_FWD_ADD(fs, start_rx_tsc);
> >  }
> >
> >  static void
> > diff --git a/config/common_base b/config/common_base index
> > 6b96e0e..6e84af4 100644
> > --- a/config/common_base
> > +++ b/config/common_base
> > @@ -998,6 +998,8 @@ CONFIG_RTE_PROC_INFO=n  #
> CONFIG_RTE_TEST_PMD=y
> > CONFIG_RTE_TEST_PMD_RECORD_CORE_CYCLES=n
> > +CONFIG_RTE_TEST_PMD_RECORD_CORE_RX_CYCLES=n
> > +CONFIG_RTE_TEST_PMD_RECORD_CORE_TX_CYCLES=n
> >  CONFIG_RTE_TEST_PMD_RECORD_BURST_STATS=n
> 
> Should the RECORD macros be documented in the run_app.rst file ?
> 
> >  #
> > --
> > 1.8.3.1
> 
> Regards,
> 
> Bernard
  
Iremonger, Bernard June 20, 2019, 3:15 p.m. UTC | #3
Hi  Slava,


> -----Original Message-----
> From: Slava Ovsiienko [mailto:viacheslavo@mellanox.com]
> Sent: Monday, June 10, 2019 5:40 AM
> To: Iremonger, Bernard <bernard.iremonger@intel.com>; dev@dpdk.org
> Cc: Yigit, Ferruh <ferruh.yigit@intel.com>
> Subject: RE: [dpdk-dev] [RFC] app/testpmd: add profiling for Rx/Tx burst
> routines
> 
<snip>

> > >  static void
> > > diff --git a/config/common_base b/config/common_base index
> > > 6b96e0e..6e84af4 100644
> > > --- a/config/common_base
> > > +++ b/config/common_base
> > > @@ -998,6 +998,8 @@ CONFIG_RTE_PROC_INFO=n  #
> > CONFIG_RTE_TEST_PMD=y
> > > CONFIG_RTE_TEST_PMD_RECORD_CORE_CYCLES=n
> > > +CONFIG_RTE_TEST_PMD_RECORD_CORE_RX_CYCLES=n
> > > +CONFIG_RTE_TEST_PMD_RECORD_CORE_TX_CYCLES=n
> > >  CONFIG_RTE_TEST_PMD_RECORD_BURST_STATS=n
> >
> > Should the RECORD macros be documented in the run_app.rst file ?

You missed the above comment in your reply.
There seems to be no documentation on the RECORD macros at present; there probably should be some.

Regards,

Bernard.
  
Slava Ovsiienko June 24, 2019, 4:03 a.m. UTC | #4
Hi, Bernard.

PSB.

> -----Original Message-----
> From: Iremonger, Bernard <bernard.iremonger@intel.com>
> Sent: Thursday, June 20, 2019 18:15
> To: Slava Ovsiienko <viacheslavo@mellanox.com>; dev@dpdk.org
> Cc: Yigit, Ferruh <ferruh.yigit@intel.com>
> Subject: RE: [dpdk-dev] [RFC] app/testpmd: add profiling for Rx/Tx burst
> routines
> 
> Hi  Slava,
> 
> 
> > -----Original Message-----
> > From: Slava Ovsiienko [mailto:viacheslavo@mellanox.com]
> > Sent: Monday, June 10, 2019 5:40 AM
> > To: Iremonger, Bernard <bernard.iremonger@intel.com>; dev@dpdk.org
> > Cc: Yigit, Ferruh <ferruh.yigit@intel.com>
> > Subject: RE: [dpdk-dev] [RFC] app/testpmd: add profiling for Rx/Tx
> > burst routines
> >
> <snip>
> 
> > > >  static void
> > > > diff --git a/config/common_base b/config/common_base index
> > > > 6b96e0e..6e84af4 100644
> > > > --- a/config/common_base
> > > > +++ b/config/common_base
> > > > @@ -998,6 +998,8 @@ CONFIG_RTE_PROC_INFO=n  #
> > > CONFIG_RTE_TEST_PMD=y
> > > > CONFIG_RTE_TEST_PMD_RECORD_CORE_CYCLES=n
> > > > +CONFIG_RTE_TEST_PMD_RECORD_CORE_RX_CYCLES=n
> > > > +CONFIG_RTE_TEST_PMD_RECORD_CORE_TX_CYCLES=n
> > > >  CONFIG_RTE_TEST_PMD_RECORD_BURST_STATS=n
> > >
> > > Should the RECORD macros be documented in the run_app.rst file ?
> 
> You missed the above comment in your reply.

Yes, I missed the comment; sorry about that.
Thank you for the gentle reminder. I will add a documentation part to the patch.

> There seems to be no documentation on the RECORD macros at present,
> there probably should be some.
> 
> Regards,
> 
> Bernard.
> 
With best regards,
Slava
  

Patch

diff --git a/app/test-pmd/csumonly.c b/app/test-pmd/csumonly.c
index f4f2a7b..251e179 100644
--- a/app/test-pmd/csumonly.c
+++ b/app/test-pmd/csumonly.c
@@ -710,19 +710,19 @@  struct simple_gre_hdr {
 	uint16_t nb_segments = 0;
 	int ret;
 
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	uint64_t start_tsc;
-	uint64_t end_tsc;
-	uint64_t core_cycles;
+#if defined(RTE_TEST_PMD_RECORD_CORE_TX_CYCLES)
+	uint64_t start_tx_tsc;
 #endif
-
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	start_tsc = rte_rdtsc();
+#if defined(RTE_TEST_PMD_RECORD_CORE_CYCLES) || \
+	defined(RTE_TEST_PMD_RECORD_CORE_RX_CYCLES)
+	uint64_t start_rx_tsc;
 #endif
 
 	/* receive a burst of packet */
+	TEST_PMD_CORE_CYC_RX_START(start_rx_tsc);
 	nb_rx = rte_eth_rx_burst(fs->rx_port, fs->rx_queue, pkts_burst,
 				 nb_pkt_per_burst);
+	TEST_PMD_CORE_CYC_RX_ADD(fs, start_rx_tsc);
 	if (unlikely(nb_rx == 0))
 		return;
 #ifdef RTE_TEST_PMD_RECORD_BURST_STATS
@@ -982,8 +982,10 @@  struct simple_gre_hdr {
 		printf("Preparing packet burst to transmit failed: %s\n",
 				rte_strerror(rte_errno));
 
+	TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
 	nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue, tx_pkts_burst,
 			nb_prep);
+	TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
 
 	/*
 	 * Retry if necessary
@@ -992,8 +994,10 @@  struct simple_gre_hdr {
 		retry = 0;
 		while (nb_tx < nb_rx && retry++ < burst_tx_retry_num) {
 			rte_delay_us(burst_tx_delay_time);
+			TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
 			nb_tx += rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
 					&tx_pkts_burst[nb_tx], nb_rx - nb_tx);
+			TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
 		}
 	}
 	fs->tx_packets += nb_tx;
@@ -1010,12 +1014,7 @@  struct simple_gre_hdr {
 			rte_pktmbuf_free(tx_pkts_burst[nb_tx]);
 		} while (++nb_tx < nb_rx);
 	}
-
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	end_tsc = rte_rdtsc();
-	core_cycles = (end_tsc - start_tsc);
-	fs->core_cycles = (uint64_t) (fs->core_cycles + core_cycles);
-#endif
+	TEST_PMD_CORE_CYC_FWD_ADD(fs, start_rx_tsc);
 }
 
 struct fwd_engine csum_fwd_engine = {
diff --git a/app/test-pmd/flowgen.c b/app/test-pmd/flowgen.c
index 3214e3c..b128e68 100644
--- a/app/test-pmd/flowgen.c
+++ b/app/test-pmd/flowgen.c
@@ -130,20 +130,21 @@ 
 	uint16_t i;
 	uint32_t retry;
 	uint64_t tx_offloads;
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	uint64_t start_tsc;
-	uint64_t end_tsc;
-	uint64_t core_cycles;
-#endif
 	static int next_flow = 0;
 
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	start_tsc = rte_rdtsc();
+#if defined(RTE_TEST_PMD_RECORD_CORE_TX_CYCLES)
+	uint64_t start_tx_tsc;
+#endif
+#if defined(RTE_TEST_PMD_RECORD_CORE_CYCLES) || \
+	defined(RTE_TEST_PMD_RECORD_CORE_RX_CYCLES)
+	uint64_t start_rx_tsc;
 #endif
 
 	/* Receive a burst of packets and discard them. */
+	TEST_PMD_CORE_CYC_RX_START(start_rx_tsc);
 	nb_rx = rte_eth_rx_burst(fs->rx_port, fs->rx_queue, pkts_burst,
 				 nb_pkt_per_burst);
+	TEST_PMD_CORE_CYC_RX_ADD(fs, start_rx_tsc);
 	fs->rx_packets += nb_rx;
 
 	for (i = 0; i < nb_rx; i++)
@@ -212,7 +213,9 @@ 
 		next_flow = (next_flow + 1) % cfg_n_flows;
 	}
 
+	TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
 	nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue, pkts_burst, nb_pkt);
+	TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
 	/*
 	 * Retry if necessary
 	 */
@@ -220,8 +223,10 @@ 
 		retry = 0;
 		while (nb_tx < nb_rx && retry++ < burst_tx_retry_num) {
 			rte_delay_us(burst_tx_delay_time);
+			TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
 			nb_tx += rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
 					&pkts_burst[nb_tx], nb_rx - nb_tx);
+			TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
 		}
 	}
 	fs->tx_packets += nb_tx;
@@ -239,11 +244,7 @@ 
 			rte_pktmbuf_free(pkts_burst[nb_tx]);
 		} while (++nb_tx < nb_pkt);
 	}
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	end_tsc = rte_rdtsc();
-	core_cycles = (end_tsc - start_tsc);
-	fs->core_cycles = (uint64_t) (fs->core_cycles + core_cycles);
-#endif
+	TEST_PMD_CORE_CYC_FWD_ADD(fs, start_rx_tsc);
 }
 
 struct fwd_engine flow_gen_engine = {
diff --git a/app/test-pmd/icmpecho.c b/app/test-pmd/icmpecho.c
index 55d266d..a539fe8 100644
--- a/app/test-pmd/icmpecho.c
+++ b/app/test-pmd/icmpecho.c
@@ -293,21 +293,22 @@ 
 	uint32_t cksum;
 	uint8_t  i;
 	int l2_len;
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	uint64_t start_tsc;
-	uint64_t end_tsc;
-	uint64_t core_cycles;
-#endif
 
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	start_tsc = rte_rdtsc();
+#if defined(RTE_TEST_PMD_RECORD_CORE_TX_CYCLES)
+	uint64_t start_tx_tsc;
+#endif
+#if defined(RTE_TEST_PMD_RECORD_CORE_CYCLES) || \
+	defined(RTE_TEST_PMD_RECORD_CORE_RX_CYCLES)
+	uint64_t start_rx_tsc;
 #endif
 
 	/*
 	 * First, receive a burst of packets.
 	 */
+	TEST_PMD_CORE_CYC_RX_START(start_rx_tsc);
 	nb_rx = rte_eth_rx_burst(fs->rx_port, fs->rx_queue, pkts_burst,
 				 nb_pkt_per_burst);
+	TEST_PMD_CORE_CYC_RX_ADD(fs, start_rx_tsc);
 	if (unlikely(nb_rx == 0))
 		return;
 
@@ -487,8 +488,10 @@ 
 
 	/* Send back ICMP echo replies, if any. */
 	if (nb_replies > 0) {
+		TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
 		nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue, pkts_burst,
 					 nb_replies);
+		TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
 		/*
 		 * Retry if necessary
 		 */
@@ -497,10 +500,12 @@ 
 			while (nb_tx < nb_replies &&
 					retry++ < burst_tx_retry_num) {
 				rte_delay_us(burst_tx_delay_time);
+				TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
 				nb_tx += rte_eth_tx_burst(fs->tx_port,
 						fs->tx_queue,
 						&pkts_burst[nb_tx],
 						nb_replies - nb_tx);
+				TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
 			}
 		}
 		fs->tx_packets += nb_tx;
@@ -514,12 +519,7 @@ 
 			} while (++nb_tx < nb_replies);
 		}
 	}
-
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	end_tsc = rte_rdtsc();
-	core_cycles = (end_tsc - start_tsc);
-	fs->core_cycles = (uint64_t) (fs->core_cycles + core_cycles);
-#endif
+	TEST_PMD_CORE_CYC_FWD_ADD(fs, start_rx_tsc);
 }
 
 struct fwd_engine icmp_echo_engine = {
diff --git a/app/test-pmd/iofwd.c b/app/test-pmd/iofwd.c
index 9dce76e..dc66a88 100644
--- a/app/test-pmd/iofwd.c
+++ b/app/test-pmd/iofwd.c
@@ -51,21 +51,21 @@ 
 	uint16_t nb_tx;
 	uint32_t retry;
 
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	uint64_t start_tsc;
-	uint64_t end_tsc;
-	uint64_t core_cycles;
+#if defined(RTE_TEST_PMD_RECORD_CORE_TX_CYCLES)
+	uint64_t start_tx_tsc;
 #endif
-
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	start_tsc = rte_rdtsc();
+#if defined(RTE_TEST_PMD_RECORD_CORE_CYCLES) || \
+	defined(RTE_TEST_PMD_RECORD_CORE_RX_CYCLES)
+	uint64_t start_rx_tsc;
 #endif
 
 	/*
 	 * Receive a burst of packets and forward them.
 	 */
+	TEST_PMD_CORE_CYC_RX_START(start_rx_tsc);
 	nb_rx = rte_eth_rx_burst(fs->rx_port, fs->rx_queue,
 			pkts_burst, nb_pkt_per_burst);
+	TEST_PMD_CORE_CYC_RX_ADD(fs, start_rx_tsc);
 	if (unlikely(nb_rx == 0))
 		return;
 	fs->rx_packets += nb_rx;
@@ -73,8 +73,10 @@ 
 #ifdef RTE_TEST_PMD_RECORD_BURST_STATS
 	fs->rx_burst_stats.pkt_burst_spread[nb_rx]++;
 #endif
+	TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
 	nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
 			pkts_burst, nb_rx);
+	TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
 	/*
 	 * Retry if necessary
 	 */
@@ -82,8 +84,10 @@ 
 		retry = 0;
 		while (nb_tx < nb_rx && retry++ < burst_tx_retry_num) {
 			rte_delay_us(burst_tx_delay_time);
+			TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
 			nb_tx += rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
 					&pkts_burst[nb_tx], nb_rx - nb_tx);
+			TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
 		}
 	}
 	fs->tx_packets += nb_tx;
@@ -96,11 +100,7 @@ 
 			rte_pktmbuf_free(pkts_burst[nb_tx]);
 		} while (++nb_tx < nb_rx);
 	}
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	end_tsc = rte_rdtsc();
-	core_cycles = (end_tsc - start_tsc);
-	fs->core_cycles = (uint64_t) (fs->core_cycles + core_cycles);
-#endif
+	TEST_PMD_CORE_CYC_FWD_ADD(fs, start_rx_tsc);
 }
 
 struct fwd_engine io_fwd_engine = {
diff --git a/app/test-pmd/macfwd.c b/app/test-pmd/macfwd.c
index 7cac757..2fd38ea 100644
--- a/app/test-pmd/macfwd.c
+++ b/app/test-pmd/macfwd.c
@@ -56,21 +56,23 @@ 
 	uint16_t i;
 	uint64_t ol_flags = 0;
 	uint64_t tx_offloads;
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	uint64_t start_tsc;
-	uint64_t end_tsc;
-	uint64_t core_cycles;
+
+#if defined(RTE_TEST_PMD_RECORD_CORE_TX_CYCLES)
+	uint64_t start_tx_tsc;
 #endif
+#if defined(RTE_TEST_PMD_RECORD_CORE_CYCLES) || \
+	defined(RTE_TEST_PMD_RECORD_CORE_RX_CYCLES)
+	uint64_t start_rx_tsc;
 
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	start_tsc = rte_rdtsc();
 #endif
 
 	/*
 	 * Receive a burst of packets and forward them.
 	 */
+	TEST_PMD_CORE_CYC_RX_START(start_rx_tsc);
 	nb_rx = rte_eth_rx_burst(fs->rx_port, fs->rx_queue, pkts_burst,
 				 nb_pkt_per_burst);
+	TEST_PMD_CORE_CYC_RX_ADD(fs, start_rx_tsc);
 	if (unlikely(nb_rx == 0))
 		return;
 
@@ -103,7 +105,9 @@ 
 		mb->vlan_tci = txp->tx_vlan_id;
 		mb->vlan_tci_outer = txp->tx_vlan_id_outer;
 	}
+	TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
 	nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue, pkts_burst, nb_rx);
+	TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
 	/*
 	 * Retry if necessary
 	 */
@@ -111,8 +115,10 @@ 
 		retry = 0;
 		while (nb_tx < nb_rx && retry++ < burst_tx_retry_num) {
 			rte_delay_us(burst_tx_delay_time);
+			TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
 			nb_tx += rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
 					&pkts_burst[nb_tx], nb_rx - nb_tx);
+			TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
 		}
 	}
 
@@ -126,11 +132,7 @@ 
 			rte_pktmbuf_free(pkts_burst[nb_tx]);
 		} while (++nb_tx < nb_rx);
 	}
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	end_tsc = rte_rdtsc();
-	core_cycles = (end_tsc - start_tsc);
-	fs->core_cycles = (uint64_t) (fs->core_cycles + core_cycles);
-#endif
+	TEST_PMD_CORE_CYC_FWD_ADD(fs, start_rx_tsc);
 }
 
 struct fwd_engine mac_fwd_engine = {
diff --git a/app/test-pmd/macswap.c b/app/test-pmd/macswap.c
index 71af916..b22acdb 100644
--- a/app/test-pmd/macswap.c
+++ b/app/test-pmd/macswap.c
@@ -86,21 +86,22 @@ 
 	uint16_t nb_rx;
 	uint16_t nb_tx;
 	uint32_t retry;
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	uint64_t start_tsc;
-	uint64_t end_tsc;
-	uint64_t core_cycles;
-#endif
 
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	start_tsc = rte_rdtsc();
+#if defined(RTE_TEST_PMD_RECORD_CORE_TX_CYCLES)
+	uint64_t start_tx_tsc;
+#endif
+#if defined(RTE_TEST_PMD_RECORD_CORE_CYCLES) || \
+	defined(RTE_TEST_PMD_RECORD_CORE_RX_CYCLES)
+	uint64_t start_rx_tsc;
 #endif
 
 	/*
 	 * Receive a burst of packets and forward them.
 	 */
+	TEST_PMD_CORE_CYC_RX_START(start_rx_tsc);
 	nb_rx = rte_eth_rx_burst(fs->rx_port, fs->rx_queue, pkts_burst,
 				 nb_pkt_per_burst);
+	TEST_PMD_CORE_CYC_RX_ADD(fs, start_rx_tsc);
 	if (unlikely(nb_rx == 0))
 		return;
 
@@ -112,7 +113,10 @@ 
 
 	do_macswap(pkts_burst, nb_rx, txp);
 
+	TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
 	nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue, pkts_burst, nb_rx);
+	TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
+
 	/*
 	 * Retry if necessary
 	 */
@@ -120,8 +124,10 @@ 
 		retry = 0;
 		while (nb_tx < nb_rx && retry++ < burst_tx_retry_num) {
 			rte_delay_us(burst_tx_delay_time);
+			TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
 			nb_tx += rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
 					&pkts_burst[nb_tx], nb_rx - nb_tx);
+			TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
 		}
 	}
 	fs->tx_packets += nb_tx;
@@ -134,11 +140,7 @@ 
 			rte_pktmbuf_free(pkts_burst[nb_tx]);
 		} while (++nb_tx < nb_rx);
 	}
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	end_tsc = rte_rdtsc();
-	core_cycles = (end_tsc - start_tsc);
-	fs->core_cycles = (uint64_t) (fs->core_cycles + core_cycles);
-#endif
+	TEST_PMD_CORE_CYC_FWD_ADD(fs, start_rx_tsc);
 }
 
 struct fwd_engine mac_swap_engine = {
diff --git a/app/test-pmd/rxonly.c b/app/test-pmd/rxonly.c
index 5c65fc4..d1da357 100644
--- a/app/test-pmd/rxonly.c
+++ b/app/test-pmd/rxonly.c
@@ -50,19 +50,18 @@ 
 	uint16_t nb_rx;
 	uint16_t i;
 
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	uint64_t start_tsc;
-	uint64_t end_tsc;
-	uint64_t core_cycles;
-
-	start_tsc = rte_rdtsc();
+#if defined(RTE_TEST_PMD_RECORD_CORE_CYCLES) || \
+	defined(RTE_TEST_PMD_RECORD_CORE_RX_CYCLES)
+	uint64_t start_rx_tsc;
 #endif
 
 	/*
 	 * Receive a burst of packets.
 	 */
+	TEST_PMD_CORE_CYC_RX_START(start_rx_tsc);
 	nb_rx = rte_eth_rx_burst(fs->rx_port, fs->rx_queue, pkts_burst,
 				 nb_pkt_per_burst);
+	TEST_PMD_CORE_CYC_RX_ADD(fs, start_rx_tsc);
 	if (unlikely(nb_rx == 0))
 		return;
 
@@ -73,11 +72,7 @@ 
 	for (i = 0; i < nb_rx; i++)
 		rte_pktmbuf_free(pkts_burst[i]);
 
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	end_tsc = rte_rdtsc();
-	core_cycles = (end_tsc - start_tsc);
-	fs->core_cycles = (uint64_t) (fs->core_cycles + core_cycles);
-#endif
+	TEST_PMD_CORE_CYC_FWD_ADD(fs, start_rx_tsc);
 }
 
 struct fwd_engine rx_only_engine = {
diff --git a/app/test-pmd/softnicfwd.c b/app/test-pmd/softnicfwd.c
index 94e6669..9b2b0e6 100644
--- a/app/test-pmd/softnicfwd.c
+++ b/app/test-pmd/softnicfwd.c
@@ -87,35 +87,39 @@  struct tm_hierarchy {
 	uint16_t nb_tx;
 	uint32_t retry;
 
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	uint64_t start_tsc;
-	uint64_t end_tsc;
-	uint64_t core_cycles;
+#if defined(RTE_TEST_PMD_RECORD_CORE_TX_CYCLES)
+	uint64_t start_tx_tsc;
 #endif
-
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	start_tsc = rte_rdtsc();
+#if defined(RTE_TEST_PMD_RECORD_CORE_CYCLES) || \
+	defined(RTE_TEST_PMD_RECORD_CORE_RX_CYCLES)
+	uint64_t start_rx_tsc;
 #endif
 
 	/*  Packets Receive */
+	TEST_PMD_CORE_CYC_RX_START(start_rx_tsc);
 	nb_rx = rte_eth_rx_burst(fs->rx_port, fs->rx_queue,
 			pkts_burst, nb_pkt_per_burst);
+	TEST_PMD_CORE_CYC_RX_ADD(fs, start_rx_tsc);
 	fs->rx_packets += nb_rx;
 
 #ifdef RTE_TEST_PMD_RECORD_BURST_STATS
 	fs->rx_burst_stats.pkt_burst_spread[nb_rx]++;
 #endif
 
+	TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
 	nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
 			pkts_burst, nb_rx);
+	TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
 
 	/* Retry if necessary */
 	if (unlikely(nb_tx < nb_rx) && fs->retry_enabled) {
 		retry = 0;
 		while (nb_tx < nb_rx && retry++ < burst_tx_retry_num) {
 			rte_delay_us(burst_tx_delay_time);
+			TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
 			nb_tx += rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
 					&pkts_burst[nb_tx], nb_rx - nb_tx);
+			TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
 		}
 	}
 	fs->tx_packets += nb_tx;
@@ -130,11 +134,7 @@  struct tm_hierarchy {
 			rte_pktmbuf_free(pkts_burst[nb_tx]);
 		} while (++nb_tx < nb_rx);
 	}
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	end_tsc = rte_rdtsc();
-	core_cycles = (end_tsc - start_tsc);
-	fs->core_cycles = (uint64_t) (fs->core_cycles + core_cycles);
-#endif
+	TEST_PMD_CORE_CYC_FWD_ADD(fs, start_rx_tsc);
 }
 
 static void
diff --git a/app/test-pmd/testpmd.c b/app/test-pmd/testpmd.c
index f0061d9..de8478f 100644
--- a/app/test-pmd/testpmd.c
+++ b/app/test-pmd/testpmd.c
@@ -1483,6 +1483,12 @@  struct extmem_param {
 #ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
 	uint64_t fwd_cycles = 0;
 #endif
+#ifdef RTE_TEST_PMD_RECORD_CORE_RX_CYCLES
+	uint64_t rx_cycles = 0;
+#endif
+#ifdef RTE_TEST_PMD_RECORD_CORE_TX_CYCLES
+	uint64_t tx_cycles = 0;
+#endif
 	uint64_t total_recv = 0;
 	uint64_t total_xmit = 0;
 	struct rte_port *port;
@@ -1513,6 +1519,12 @@  struct extmem_param {
 #ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
 		fwd_cycles += fs->core_cycles;
 #endif
+#ifdef RTE_TEST_PMD_RECORD_CORE_RX_CYCLES
+		rx_cycles += fs->core_rx_cycles;
+#endif
+#ifdef RTE_TEST_PMD_RECORD_CORE_TX_CYCLES
+		tx_cycles += fs->core_tx_cycles;
+#endif
 	}
 	for (i = 0; i < cur_fwd_config.nb_fwd_ports; i++) {
 		uint8_t j;
@@ -1648,6 +1660,20 @@  struct extmem_param {
 		       (unsigned int)(fwd_cycles / total_recv),
 		       fwd_cycles, total_recv);
 #endif
+#ifdef RTE_TEST_PMD_RECORD_CORE_RX_CYCLES
+	if (total_recv > 0)
+		printf("\n  rx CPU cycles/packet=%u (total cycles="
+		       "%"PRIu64" / total RX packets=%"PRIu64")\n",
+		       (unsigned int)(rx_cycles / total_recv),
+		       rx_cycles, total_recv);
+#endif
+#ifdef RTE_TEST_PMD_RECORD_CORE_TX_CYCLES
+	if (total_xmit > 0)
+		printf("\n  tx CPU cycles/packet=%u (total cycles="
+		       "%"PRIu64" / total TX packets=%"PRIu64")\n",
+		       (unsigned int)(tx_cycles / total_xmit),
+		       tx_cycles, total_xmit);
+#endif
 }
 
 void
@@ -1678,6 +1704,12 @@  struct extmem_param {
 #ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
 		fs->core_cycles = 0;
 #endif
+#ifdef RTE_TEST_PMD_RECORD_CORE_RX_CYCLES
+		fs->core_rx_cycles = 0;
+#endif
+#ifdef RTE_TEST_PMD_RECORD_CORE_TX_CYCLES
+		fs->core_tx_cycles = 0;
+#endif
 	}
 }
 
diff --git a/app/test-pmd/testpmd.h b/app/test-pmd/testpmd.h
index 1d9b7a2..4e8af8a 100644
--- a/app/test-pmd/testpmd.h
+++ b/app/test-pmd/testpmd.h
@@ -130,12 +130,52 @@  struct fwd_stream {
 #ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
 	uint64_t     core_cycles; /**< used for RX and TX processing */
 #endif
+#ifdef RTE_TEST_PMD_RECORD_CORE_TX_CYCLES
+	uint64_t     core_tx_cycles; /**< used for tx_burst processing */
+#endif
+#ifdef RTE_TEST_PMD_RECORD_CORE_RX_CYCLES
+	uint64_t     core_rx_cycles; /**< used for rx_burst processing */
+#endif
 #ifdef RTE_TEST_PMD_RECORD_BURST_STATS
 	struct pkt_burst_stats rx_burst_stats;
 	struct pkt_burst_stats tx_burst_stats;
 #endif
 };
 
+/*
+ * Profiling helpers: START stores rte_rdtsc() in 'a'; ADD adds the ticks
+ * elapsed since 's' to a counter of 'fs'. Disabled variants are no-ops.
+ */
+#ifdef RTE_TEST_PMD_RECORD_CORE_TX_CYCLES
+#define TEST_PMD_CORE_CYC_TX_START(a) do { (a) = rte_rdtsc(); } while (0)
+#define TEST_PMD_CORE_CYC_TX_ADD(fs, s) \
+	do { (fs)->core_tx_cycles += rte_rdtsc() - (s); } while (0)
+#else
+#define TEST_PMD_CORE_CYC_TX_START(a) do { } while (0)
+#define TEST_PMD_CORE_CYC_TX_ADD(fs, s) do { } while (0)
+#endif
+/* Also enabled for CORE_CYCLES: callers pass the Rx stamp to FWD_ADD. */
+#if defined(RTE_TEST_PMD_RECORD_CORE_CYCLES) || \
+	defined(RTE_TEST_PMD_RECORD_CORE_RX_CYCLES)
+#define TEST_PMD_CORE_CYC_RX_START(a) do { (a) = rte_rdtsc(); } while (0)
+#else
+#define TEST_PMD_CORE_CYC_RX_START(a) do { } while (0)
+#endif
+
+#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
+#define TEST_PMD_CORE_CYC_FWD_ADD(fs, s) \
+	do { (fs)->core_cycles += rte_rdtsc() - (s); } while (0)
+#else
+#define TEST_PMD_CORE_CYC_FWD_ADD(fs, s) do { } while (0)
+#endif
+
+#ifdef RTE_TEST_PMD_RECORD_CORE_RX_CYCLES
+#define TEST_PMD_CORE_CYC_RX_ADD(fs, s) \
+	do { (fs)->core_rx_cycles += rte_rdtsc() - (s); } while (0)
+#else
+#define TEST_PMD_CORE_CYC_RX_ADD(fs, s) do { } while (0)
+#endif
+
 /** Descriptor for a single flow. */
 struct port_flow {
 	struct port_flow *next; /**< Next flow in list. */
diff --git a/app/test-pmd/txonly.c b/app/test-pmd/txonly.c
index fdfca14..fe3045a 100644
--- a/app/test-pmd/txonly.c
+++ b/app/test-pmd/txonly.c
@@ -241,16 +241,16 @@ 
 	uint32_t retry;
 	uint64_t ol_flags = 0;
 	uint64_t tx_offloads;
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	uint64_t start_tsc;
-	uint64_t end_tsc;
-	uint64_t core_cycles;
+#if defined(RTE_TEST_PMD_RECORD_CORE_TX_CYCLES)
+	uint64_t start_tx_tsc;
+#endif
+#if defined(RTE_TEST_PMD_RECORD_CORE_CYCLES)
+	uint64_t start_rx_tsc;
 #endif
 
 #ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	start_tsc = rte_rdtsc();
+	TEST_PMD_CORE_CYC_RX_START(start_rx_tsc);
 #endif
-
 	mbp = current_fwd_lcore()->mbp;
 	txp = &ports[fs->tx_port];
 	tx_offloads = txp->dev_conf.txmode.offloads;
@@ -302,7 +302,9 @@ 
 	if (nb_pkt == 0)
 		return;
 
+	TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
 	nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue, pkts_burst, nb_pkt);
+	TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
 	/*
 	 * Retry if necessary
 	 */
@@ -310,8 +312,10 @@ 
 		retry = 0;
 		while (nb_tx < nb_pkt && retry++ < burst_tx_retry_num) {
 			rte_delay_us(burst_tx_delay_time);
+			TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
 			nb_tx += rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
 					&pkts_burst[nb_tx], nb_pkt - nb_tx);
+			TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
 		}
 	}
 	fs->tx_packets += nb_tx;
@@ -334,12 +338,7 @@ 
 			rte_pktmbuf_free(pkts_burst[nb_tx]);
 		} while (++nb_tx < nb_pkt);
 	}
-
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	end_tsc = rte_rdtsc();
-	core_cycles = (end_tsc - start_tsc);
-	fs->core_cycles = (uint64_t) (fs->core_cycles + core_cycles);
-#endif
+	TEST_PMD_CORE_CYC_FWD_ADD(fs, start_rx_tsc);
 }
 
 static void
diff --git a/config/common_base b/config/common_base
index 6b96e0e..6e84af4 100644
--- a/config/common_base
+++ b/config/common_base
@@ -998,6 +998,8 @@  CONFIG_RTE_PROC_INFO=n
 #
 CONFIG_RTE_TEST_PMD=y
 CONFIG_RTE_TEST_PMD_RECORD_CORE_CYCLES=n
+CONFIG_RTE_TEST_PMD_RECORD_CORE_RX_CYCLES=n
+CONFIG_RTE_TEST_PMD_RECORD_CORE_TX_CYCLES=n
 CONFIG_RTE_TEST_PMD_RECORD_BURST_STATS=n
 
 #