[v1] examples/distributor: detect high frequency cores

Message ID 20190222114551.30692-1-david.hunt@intel.com (mailing list archive)
State Superseded, archived
Delegated to: Thomas Monjalon
Headers
Series [v1] examples/distributor: detect high frequency cores |

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/Intel-compilation success Compilation OK
ci/intel-Performance-Testing success Performance Testing PASS
ci/mellanox-Performance-Testing success Performance Testing PASS

Commit Message

Hunt, David Feb. 22, 2019, 11:45 a.m. UTC
  The distributor application is bottlenecked by the distributor core,
so if we can give more frequency to this core, then the overall
performance of the application may increase.

This patch uses the rte_power_get_capabilities() API to query the cores
provided in the core mask, and if any high frequency cores are found
(e.g. Turbo Boost is enabled), we will pin the distributor workload to
that core.

Signed-off-by: Liang Ma <liang.j.ma@intel.com>
Signed-off-by: David Hunt <david.hunt@intel.com>
---
 examples/distributor/main.c      | 185 ++++++++++++++++++++++++-------
 examples/distributor/meson.build |   2 +-
 2 files changed, 149 insertions(+), 38 deletions(-)
  

Comments

Anatoly Burakov March 27, 2019, 1:58 p.m. UTC | #1
On 22-Feb-19 11:45 AM, David Hunt wrote:
> The distributor application is bottlenecked by the distributor core,
> so if we can give more frequency to this core, then the overall
> performance of the application may increase.
> 
> This patch uses the rte_power_get_capabilities() API to query the cores
> provided in the core mask, and if any high frequency cores are found
> (e.g. Turbo Boost is enabled), we will pin the distributor workload to
> that core.
> 
> Signed-off-by: Liang Ma <liang.j.ma@intel.com>
> Signed-off-by: David Hunt <david.hunt@intel.com>
> ---
>   examples/distributor/main.c      | 185 ++++++++++++++++++++++++-------
>   examples/distributor/meson.build |   2 +-
>   2 files changed, 149 insertions(+), 38 deletions(-)
> 
> diff --git a/examples/distributor/main.c b/examples/distributor/main.c
> index 03a05e3d9..0541c50b0 100644
> --- a/examples/distributor/main.c
> +++ b/examples/distributor/main.c
> @@ -16,6 +16,7 @@
>   #include <rte_prefetch.h>
>   #include <rte_distributor.h>
>   #include <rte_pause.h>
> +#include <rte_power.h>
>   
>   #define RX_RING_SIZE 1024
>   #define TX_RING_SIZE 1024
> @@ -281,6 +282,7 @@ lcore_rx(struct lcore_params *p)
>   		if (++port == nb_ports)
>   			port = 0;
>   	}
> +	rte_power_exit(rte_lcore_id());

Why is this being added? It doesn't seem relevant to either the commit 
message or the feature. If this was missing before, please add it in a 
separate patch. The same applies to all other instances where 
rte_power_exit() is added.

Also, your app seems to support both power and non-power operation. What 
happens when rte_power_exit() is called on an lcore that has not been 
initialized (i.e. after the fallback to non-power mode)? Is this code (and 
the other rte_power_exit() instances) only called when in power mode?

>   	/* set worker & tx threads quit flag */
>   	printf("\nCore %u exiting rx task.\n", rte_lcore_id());
>   	quit_signal = 1;
> @@ -364,6 +366,8 @@ lcore_distributor(struct lcore_params *p)
>   	printf("\nCore %u exiting distributor task.\n", rte_lcore_id());
>   	quit_signal_work = 1;
>   
> +	rte_power_exit(rte_lcore_id());
> +
>   	rte_distributor_flush(d);
>   	/* Unblock any returns so workers can exit */
>   	rte_distributor_clear_returns(d);
> @@ -435,6 +439,7 @@ lcore_tx(struct rte_ring *in_r)
>   			}
>   		}
>   	}
> +	rte_power_exit(rte_lcore_id());
>   	printf("\nCore %u exiting tx task.\n", rte_lcore_id());
>   	return 0;
>   }
> @@ -575,9 +580,32 @@ lcore_worker(struct lcore_params *p)
>   		if (num > 0)
>   			app_stats.worker_bursts[p->worker_id][num-1]++;
>   	}
> +	rte_power_exit(rte_lcore_id());
>   	return 0;
>   }
>   
> +static int
> +init_power_library(void)
> +{
> +	int ret = 0, lcore_id;
> +	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {

RTE_LCORE_FOREACH?

> +		if (rte_lcore_is_enabled(lcore_id)) {
> +			/* init power management library */
> +			ret = rte_power_init(lcore_id);
> +			if (ret)
> +				RTE_LOG(ERR, POWER,
> +				"Library initialization failed on core %u\n",
> +				lcore_id);
> +				/*
> +				 * Return on first failure, we'll fall back
> +				 * to non-power operation
> +				 */
> +				return ret;

You'll probably want to fix indentation here, it's misleading.

> +		}
> +	}
> +	return ret;
> +}
> +
>   /* display usage */
>   static void
>   print_usage(const char *prgname)

<...>

> +		 * Here we'll pre-assign lcore ids to the rx, tx and
> +		 * distributor workloads if there's higher frequency
> +		 * on those cores e.g. if Turbo Boost is enabled.
> +		 * It's also worth mentioning that it will assign cores in a
> +		 * specific order, so that if there's less than three
> +		 * available, the higher frequency cores will go to the
> +		 * distributor first, then rx, then tx.
> +		 */
> +		RTE_LCORE_FOREACH_SLAVE(lcore_id) {
> +
> +			rte_power_get_capabilities(lcore_id, &lcore_cap);
> +
> +			if (lcore_cap.turbo == 1) {
> +				priority_num++;
> +				switch (priority_num) {
> +				case 1:
> +					distr_core_id = lcore_id;
> +					printf("Distributor on priority core %d\n",

This says "priority", other instances say "preferred". Which is it? :)

> +							lcore_id);
> +					break;
> +				case 2:
> +					rx_core_id = lcore_id;
> +					printf("Rx on preferred core %d\n",
> +							lcore_id);
> +					break;
> +				case 3:
> +					tx_core_id = lcore_id;
> +					printf("Tx on preferred core %d\n",
> +							lcore_id);
> +					break;
> +				default:
> +					break;
> +				}
> +			}
> +		}
> +	}
> +
> +	/*
> +	 * If there's  any of the key workloads left without an lcore_id

Double space after "there's".

> +	 * after the higer frequency core assignment above, pre-assign
> +	 * them here.
> +	 */
>   	RTE_LCORE_FOREACH_SLAVE(lcore_id) {
> -		if (worker_id == rte_lcore_count() - 3) {
> -			printf("Starting distributor on lcore_id %d\n",
> -					lcore_id);
> -			/* distributor core */
> -			struct lcore_params *p =
> -					rte_malloc(NULL, sizeof(*p), 0);
> -			if (!p)
> -				rte_panic("malloc failure\n");
> -			*p = (struct lcore_params){worker_id, d,
> -				rx_dist_ring, dist_tx_ring, mbuf_pool};
> -			rte_eal_remote_launch(
> -				(lcore_function_t *)lcore_distributor,
> -				p, lcore_id);
> -		} else if (worker_id == rte_lcore_count() - 4) {
> -			printf("Starting tx  on worker_id %d, lcore_id %d\n",
> -					worker_id, lcore_id);
> -			/* tx core */
> -			rte_eal_remote_launch((lcore_function_t *)lcore_tx,
> -					dist_tx_ring, lcore_id);
> -		} else if (worker_id == rte_lcore_count() - 2) {
> -			printf("Starting rx on worker_id %d, lcore_id %d\n",
> -					worker_id, lcore_id);
> -			/* rx core */
> -			struct lcore_params *p =
> -					rte_malloc(NULL, sizeof(*p), 0);
> -			if (!p)
> -				rte_panic("malloc failure\n");
> -			*p = (struct lcore_params){worker_id, d, rx_dist_ring,
> -					dist_tx_ring, mbuf_pool};
> -			rte_eal_remote_launch((lcore_function_t *)lcore_rx,
> -					p, lcore_id);
> +
> +		if (distr_core_id == 0) {

0 is a valid core id. You would probably want to use -1 here.

> +			distr_core_id = lcore_id;
> +			printf("Distributor on core %d\n", lcore_id);
> +		}
> +		if ((rx_core_id == 0) &&
> +				(lcore_id != distr_core_id)) {

You could just check if (lcore_id == distr_core_id || lcore_id == 
rx_core_id || lcore_id == tx_core_id) and skip the iteration entirely, 
rather than checking at every step.

> +			rx_core_id = lcore_id;
> +			printf("Rx on core %d\n", lcore_id);
> +		}
> +		if ((tx_core_id == 0) &&
> +				(lcore_id != distr_core_id) &&
> +				(lcore_id != rx_core_id)) {
> +			tx_core_id = lcore_id;
> +			printf("Tx on core %d\n", lcore_id);
> +		}
> +		counter++;
> +	}
> +
> +	printf(" tx id %d, dist id %d, rx id %d\n",
> +			tx_core_id,
> +			distr_core_id,
> +			rx_core_id);
> +
> +	/*
> +	 * Kick off all the worker threads first, avoiding the pre-assigned
> +	 * lcore_ids for tx, rx and distributor workloads.
> +	 */
> +	RTE_LCORE_FOREACH_SLAVE(lcore_id) {
> +
> +		if ((lcore_id == distr_core_id) ||
> +			(lcore_id == rx_core_id) ||
> +			(lcore_id == tx_core_id)) {
> +
>   		} else {

This is a very unorthodox way of skipping an iteration :)
  
Hunt, David March 28, 2019, 10:20 a.m. UTC | #2
Hi Anatoly,

On 27/3/2019 1:58 PM, Burakov, Anatoly wrote:
> On 22-Feb-19 11:45 AM, David Hunt wrote:
>> The distributor application is bottlenecked by the distributor core,
>> so if we can give more frequency to this core, then the overall
>> performance of the application may increase.
>>
>> This patch uses the rte_power_get_capabilities() API to query the cores
>> provided in the core mask, and if any high frequency cores are found
>> (e.g. Turbo Boost is enabled), we will pin the distributor workload to
>> that core.
>>
>> Signed-off-by: Liang Ma <liang.j.ma@intel.com>
>> Signed-off-by: David Hunt <david.hunt@intel.com>
>> ---
>>   examples/distributor/main.c      | 185 ++++++++++++++++++++++++-------
>>   examples/distributor/meson.build |   2 +-
>>   2 files changed, 149 insertions(+), 38 deletions(-)
>>
>> diff --git a/examples/distributor/main.c b/examples/distributor/main.c
>> index 03a05e3d9..0541c50b0 100644
>> --- a/examples/distributor/main.c
>> +++ b/examples/distributor/main.c
>> @@ -16,6 +16,7 @@
>>   #include <rte_prefetch.h>
>>   #include <rte_distributor.h>
>>   #include <rte_pause.h>
>> +#include <rte_power.h>
>>     #define RX_RING_SIZE 1024
>>   #define TX_RING_SIZE 1024
>> @@ -281,6 +282,7 @@ lcore_rx(struct lcore_params *p)
>>           if (++port == nb_ports)
>>               port = 0;
>>       }
>> +    rte_power_exit(rte_lcore_id());
>
> why is this being added? it doesn't seem relevant to neither the 
> commit message nor the feature. if this was missing before, please add 
> it in a separate patch. same applies to all other instances where 
> rte_power_exit() is added.


I'll make "power_lib_initialised" a global, and check that it's set before 
calling rte_power_exit().


>
> also, your app seems to support power and non-power operation. what 
> happens when rte_power_exit is called on an lcore that's not been 
> initialized (i.e. the fallback to non-power mode)? does this (and 
> other rte_power_exit() instances) code only get called when in power 
> mode?

No issue with calling it on a non-power-enabled core, but I'll make it 
conditional anyway.


>
>>       /* set worker & tx threads quit flag */
>>       printf("\nCore %u exiting rx task.\n", rte_lcore_id());
>>       quit_signal = 1;
>> @@ -364,6 +366,8 @@ lcore_distributor(struct lcore_params *p)
>>       printf("\nCore %u exiting distributor task.\n", rte_lcore_id());
>>       quit_signal_work = 1;
>>   +    rte_power_exit(rte_lcore_id());
>> +
>>       rte_distributor_flush(d);
>>       /* Unblock any returns so workers can exit */
>>       rte_distributor_clear_returns(d);
>> @@ -435,6 +439,7 @@ lcore_tx(struct rte_ring *in_r)
>>               }
>>           }
>>       }
>> +    rte_power_exit(rte_lcore_id());
>>       printf("\nCore %u exiting tx task.\n", rte_lcore_id());
>>       return 0;
>>   }
>> @@ -575,9 +580,32 @@ lcore_worker(struct lcore_params *p)
>>           if (num > 0)
>>               app_stats.worker_bursts[p->worker_id][num-1]++;
>>       }
>> +    rte_power_exit(rte_lcore_id());
>>       return 0;
>>   }
>>   +static int
>> +init_power_library(void)
>> +{
>> +    int ret = 0, lcore_id;
>> +    for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
>
> RTE_LCORE_FOREACH?


Done in v2


>
>> +        if (rte_lcore_is_enabled(lcore_id)) {
>> +            /* init power management library */
>> +            ret = rte_power_init(lcore_id);
>> +            if (ret)
>> +                RTE_LOG(ERR, POWER,
>> +                "Library initialization failed on core %u\n",
>> +                lcore_id);
>> +                /*
>> +                 * Return on first failure, we'll fall back
>> +                 * to non-power operation
>> +                 */
>> +                return ret;
>
> You'll probably want to fix indentation here, it's misleading.


Fixed in v2. I also added braces around the RTE_LOG and return(). :)


>
>> +        }
>> +    }
>> +    return ret;
>> +}
>> +
>>   /* display usage */
>>   static void
>>   print_usage(const char *prgname)
>
> <...>
>
>> +         * Here we'll pre-assign lcore ids to the rx, tx and
>> +         * distributor workloads if there's higher frequency
>> +         * on those cores e.g. if Turbo Boost is enabled.
>> +         * It's also worth mentioning that it will assign cores in a
>> +         * specific order, so that if there's less than three
>> +         * available, the higher frequency cores will go to the
>> +         * distributor first, then rx, then tx.
>> +         */
>> +        RTE_LCORE_FOREACH_SLAVE(lcore_id) {
>> +
>> +            rte_power_get_capabilities(lcore_id, &lcore_cap);
>> +
>> +            if (lcore_cap.turbo == 1) {
>> +                priority_num++;
>> +                switch (priority_num) {
>> +                case 1:
>> +                    distr_core_id = lcore_id;
>> +                    printf("Distributor on priority core %d\n",
>
> This says "priority", other instances say "preferred". Which is it? :)


Will change to priority.


>
>> +                            lcore_id);
>> +                    break;
>> +                case 2:
>> +                    rx_core_id = lcore_id;
>> +                    printf("Rx on preferred core %d\n",
>> +                            lcore_id);
>> +                    break;
>> +                case 3:
>> +                    tx_core_id = lcore_id;
>> +                    printf("Tx on preferred core %d\n",
>> +                            lcore_id);
>> +                    break;
>> +                default:
>> +                    break;
>> +                }
>> +            }
>> +        }
>> +    }
>> +
>> +    /*
>> +     * If there's  any of the key workloads left without an lcore_id
>
> Double space after "there's".


Fixed in v2


>
>> +     * after the higer frequency core assignment above, pre-assign
>> +     * them here.
>> +     */
>>       RTE_LCORE_FOREACH_SLAVE(lcore_id) {
>> -        if (worker_id == rte_lcore_count() - 3) {
>> -            printf("Starting distributor on lcore_id %d\n",
>> -                    lcore_id);
>> -            /* distributor core */
>> -            struct lcore_params *p =
>> -                    rte_malloc(NULL, sizeof(*p), 0);
>> -            if (!p)
>> -                rte_panic("malloc failure\n");
>> -            *p = (struct lcore_params){worker_id, d,
>> -                rx_dist_ring, dist_tx_ring, mbuf_pool};
>> -            rte_eal_remote_launch(
>> -                (lcore_function_t *)lcore_distributor,
>> -                p, lcore_id);
>> -        } else if (worker_id == rte_lcore_count() - 4) {
>> -            printf("Starting tx  on worker_id %d, lcore_id %d\n",
>> -                    worker_id, lcore_id);
>> -            /* tx core */
>> -            rte_eal_remote_launch((lcore_function_t *)lcore_tx,
>> -                    dist_tx_ring, lcore_id);
>> -        } else if (worker_id == rte_lcore_count() - 2) {
>> -            printf("Starting rx on worker_id %d, lcore_id %d\n",
>> -                    worker_id, lcore_id);
>> -            /* rx core */
>> -            struct lcore_params *p =
>> -                    rte_malloc(NULL, sizeof(*p), 0);
>> -            if (!p)
>> -                rte_panic("malloc failure\n");
>> -            *p = (struct lcore_params){worker_id, d, rx_dist_ring,
>> -                    dist_tx_ring, mbuf_pool};
>> -            rte_eal_remote_launch((lcore_function_t *)lcore_rx,
>> -                    p, lcore_id);
>> +
>> +        if (distr_core_id == 0) {
>
> 0 is a valid core id. You would probably want to use -1 here.


I've changed to int using -1 for invalid cores across the app.


>
>> +            distr_core_id = lcore_id;
>> +            printf("Distributor on core %d\n", lcore_id);
>> +        }
>> +        if ((rx_core_id == 0) &&
>> +                (lcore_id != distr_core_id)) {
>
> You could just check if (lcore_id == distr_core_id || lcore_id == 
> rx_core_id || lcore_id == tx_core_id) and skip the iteration entirely, 
> rather than checking at every step.


Done in v2.


>
>> +            rx_core_id = lcore_id;
>> +            printf("Rx on core %d\n", lcore_id);
>> +        }
>> +        if ((tx_core_id == 0) &&
>> +                (lcore_id != distr_core_id) &&
>> +                (lcore_id != rx_core_id)) {
>> +            tx_core_id = lcore_id;
>> +            printf("Tx on core %d\n", lcore_id);
>> +        }
>> +        counter++;
>> +    }
>> +
>> +    printf(" tx id %d, dist id %d, rx id %d\n",
>> +            tx_core_id,
>> +            distr_core_id,
>> +            rx_core_id);
>> +
>> +    /*
>> +     * Kick off all the worker threads first, avoiding the pre-assigned
>> +     * lcore_ids for tx, rx and distributor workloads.
>> +     */
>> +    RTE_LCORE_FOREACH_SLAVE(lcore_id) {
>> +
>> +        if ((lcore_id == distr_core_id) ||
>> +            (lcore_id == rx_core_id) ||
>> +            (lcore_id == tx_core_id)) {
>> +
>>           } else {
>
> This is a very unorthodox way of skipping an iteration :)
>

Fixed in v2 to be like your previous suggestion above, using continue.


Thanks for the review, v2 coming in a few hours.

Rgds,

Dave.
  

Patch

diff --git a/examples/distributor/main.c b/examples/distributor/main.c
index 03a05e3d9..0541c50b0 100644
--- a/examples/distributor/main.c
+++ b/examples/distributor/main.c
@@ -16,6 +16,7 @@ 
 #include <rte_prefetch.h>
 #include <rte_distributor.h>
 #include <rte_pause.h>
+#include <rte_power.h>
 
 #define RX_RING_SIZE 1024
 #define TX_RING_SIZE 1024
@@ -281,6 +282,7 @@  lcore_rx(struct lcore_params *p)
 		if (++port == nb_ports)
 			port = 0;
 	}
+	rte_power_exit(rte_lcore_id());
 	/* set worker & tx threads quit flag */
 	printf("\nCore %u exiting rx task.\n", rte_lcore_id());
 	quit_signal = 1;
@@ -364,6 +366,8 @@  lcore_distributor(struct lcore_params *p)
 	printf("\nCore %u exiting distributor task.\n", rte_lcore_id());
 	quit_signal_work = 1;
 
+	rte_power_exit(rte_lcore_id());
+
 	rte_distributor_flush(d);
 	/* Unblock any returns so workers can exit */
 	rte_distributor_clear_returns(d);
@@ -435,6 +439,7 @@  lcore_tx(struct rte_ring *in_r)
 			}
 		}
 	}
+	rte_power_exit(rte_lcore_id());
 	printf("\nCore %u exiting tx task.\n", rte_lcore_id());
 	return 0;
 }
@@ -575,9 +580,32 @@  lcore_worker(struct lcore_params *p)
 		if (num > 0)
 			app_stats.worker_bursts[p->worker_id][num-1]++;
 	}
+	rte_power_exit(rte_lcore_id());
 	return 0;
 }
 
+static int
+init_power_library(void)
+{
+	int ret = 0, lcore_id;
+	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
+		if (rte_lcore_is_enabled(lcore_id)) {
+			/* init power management library */
+			ret = rte_power_init(lcore_id);
+			if (ret)
+				RTE_LOG(ERR, POWER,
+				"Library initialization failed on core %u\n",
+				lcore_id);
+				/*
+				 * Return on first failure, we'll fall back
+				 * to non-power operation
+				 */
+				return ret;
+		}
+	}
+	return ret;
+}
+
 /* display usage */
 static void
 print_usage(const char *prgname)
@@ -657,11 +685,15 @@  main(int argc, char *argv[])
 	struct rte_distributor *d;
 	struct rte_ring *dist_tx_ring;
 	struct rte_ring *rx_dist_ring;
-	unsigned lcore_id, worker_id = 0;
+	struct rte_power_core_capabilities lcore_cap;
+	unsigned int lcore_id, worker_id = 0, priority_num = 0;
+	unsigned int distr_core_id = 0, rx_core_id = 0, tx_core_id = 0;
 	unsigned nb_ports;
 	uint16_t portid;
 	uint16_t nb_ports_available;
 	uint64_t t, freq;
+	unsigned int counter = 0;
+	unsigned int power_lib_initialised = 0;
 
 	/* catch ctrl-c so we can print on exit */
 	signal(SIGINT, int_handler);
@@ -687,6 +719,9 @@  main(int argc, char *argv[])
 				"1 lcore for packet TX\n"
 				"and at least 1 lcore for worker threads\n");
 
+	if (init_power_library() == 0)
+		power_lib_initialised = 1;
+
 	nb_ports = rte_eth_dev_count_avail();
 	if (nb_ports == 0)
 		rte_exit(EXIT_FAILURE, "Error: no ethernet ports detected\n");
@@ -742,54 +777,126 @@  main(int argc, char *argv[])
 	if (rx_dist_ring == NULL)
 		rte_exit(EXIT_FAILURE, "Cannot create output ring\n");
 
+	if (power_lib_initialised) {
+		/*
+		 * Here we'll pre-assign lcore ids to the rx, tx and
+		 * distributor workloads if there's higher frequency
+		 * on those cores e.g. if Turbo Boost is enabled.
+		 * It's also worth mentioning that it will assign cores in a
+		 * specific order, so that if there's less than three
+		 * available, the higher frequency cores will go to the
+		 * distributor first, then rx, then tx.
+		 */
+		RTE_LCORE_FOREACH_SLAVE(lcore_id) {
+
+			rte_power_get_capabilities(lcore_id, &lcore_cap);
+
+			if (lcore_cap.turbo == 1) {
+				priority_num++;
+				switch (priority_num) {
+				case 1:
+					distr_core_id = lcore_id;
+					printf("Distributor on priority core %d\n",
+							lcore_id);
+					break;
+				case 2:
+					rx_core_id = lcore_id;
+					printf("Rx on preferred core %d\n",
+							lcore_id);
+					break;
+				case 3:
+					tx_core_id = lcore_id;
+					printf("Tx on preferred core %d\n",
+							lcore_id);
+					break;
+				default:
+					break;
+				}
+			}
+		}
+	}
+
+	/*
+	 * If there's  any of the key workloads left without an lcore_id
+	 * after the higer frequency core assignment above, pre-assign
+	 * them here.
+	 */
 	RTE_LCORE_FOREACH_SLAVE(lcore_id) {
-		if (worker_id == rte_lcore_count() - 3) {
-			printf("Starting distributor on lcore_id %d\n",
-					lcore_id);
-			/* distributor core */
-			struct lcore_params *p =
-					rte_malloc(NULL, sizeof(*p), 0);
-			if (!p)
-				rte_panic("malloc failure\n");
-			*p = (struct lcore_params){worker_id, d,
-				rx_dist_ring, dist_tx_ring, mbuf_pool};
-			rte_eal_remote_launch(
-				(lcore_function_t *)lcore_distributor,
-				p, lcore_id);
-		} else if (worker_id == rte_lcore_count() - 4) {
-			printf("Starting tx  on worker_id %d, lcore_id %d\n",
-					worker_id, lcore_id);
-			/* tx core */
-			rte_eal_remote_launch((lcore_function_t *)lcore_tx,
-					dist_tx_ring, lcore_id);
-		} else if (worker_id == rte_lcore_count() - 2) {
-			printf("Starting rx on worker_id %d, lcore_id %d\n",
-					worker_id, lcore_id);
-			/* rx core */
-			struct lcore_params *p =
-					rte_malloc(NULL, sizeof(*p), 0);
-			if (!p)
-				rte_panic("malloc failure\n");
-			*p = (struct lcore_params){worker_id, d, rx_dist_ring,
-					dist_tx_ring, mbuf_pool};
-			rte_eal_remote_launch((lcore_function_t *)lcore_rx,
-					p, lcore_id);
+
+		if (distr_core_id == 0) {
+			distr_core_id = lcore_id;
+			printf("Distributor on core %d\n", lcore_id);
+		}
+		if ((rx_core_id == 0) &&
+				(lcore_id != distr_core_id)) {
+			rx_core_id = lcore_id;
+			printf("Rx on core %d\n", lcore_id);
+		}
+		if ((tx_core_id == 0) &&
+				(lcore_id != distr_core_id) &&
+				(lcore_id != rx_core_id)) {
+			tx_core_id = lcore_id;
+			printf("Tx on core %d\n", lcore_id);
+		}
+		counter++;
+	}
+
+	printf(" tx id %d, dist id %d, rx id %d\n",
+			tx_core_id,
+			distr_core_id,
+			rx_core_id);
+
+	/*
+	 * Kick off all the worker threads first, avoiding the pre-assigned
+	 * lcore_ids for tx, rx and distributor workloads.
+	 */
+	RTE_LCORE_FOREACH_SLAVE(lcore_id) {
+
+		if ((lcore_id == distr_core_id) ||
+			(lcore_id == rx_core_id) ||
+			(lcore_id == tx_core_id)) {
+
 		} else {
-			printf("Starting worker on worker_id %d, lcore_id %d\n",
+
+			printf("Starting thread %d as worker, lcore_id %d\n",
 					worker_id, lcore_id);
 			struct lcore_params *p =
-					rte_malloc(NULL, sizeof(*p), 0);
+				rte_malloc(NULL, sizeof(*p), 0);
 			if (!p)
 				rte_panic("malloc failure\n");
-			*p = (struct lcore_params){worker_id, d, rx_dist_ring,
-					dist_tx_ring, mbuf_pool};
+			*p = (struct lcore_params){worker_id++, d, rx_dist_ring,
+				dist_tx_ring, mbuf_pool};
 
 			rte_eal_remote_launch((lcore_function_t *)lcore_worker,
 					p, lcore_id);
 		}
-		worker_id++;
 	}
 
+	/* Start tx core */
+	rte_eal_remote_launch((lcore_function_t *)lcore_tx,
+			dist_tx_ring, tx_core_id);
+
+	/* Start distributor core */
+	struct lcore_params *pd =
+		rte_malloc(NULL, sizeof(*pd), 0);
+	if (!pd)
+		rte_panic("malloc failure\n");
+	*pd = (struct lcore_params){worker_id++, d,
+		rx_dist_ring, dist_tx_ring, mbuf_pool};
+	rte_eal_remote_launch(
+			(lcore_function_t *)lcore_distributor,
+			pd, distr_core_id);
+
+	/* Start rx core */
+	struct lcore_params *pr =
+		rte_malloc(NULL, sizeof(*pr), 0);
+	if (!pr)
+		rte_panic("malloc failure\n");
+	*pr = (struct lcore_params){worker_id++, d, rx_dist_ring,
+		dist_tx_ring, mbuf_pool};
+	rte_eal_remote_launch((lcore_function_t *)lcore_rx,
+			pr, rx_core_id);
+
 	freq = rte_get_timer_hz();
 	t = rte_rdtsc() + freq;
 	while (!quit_signal_dist) {
@@ -806,5 +913,9 @@  main(int argc, char *argv[])
 	}
 
 	print_stats();
+
+	rte_free(pd);
+	rte_free(pr);
+
 	return 0;
 }
diff --git a/examples/distributor/meson.build b/examples/distributor/meson.build
index 88c001f56..8cf2ca1da 100644
--- a/examples/distributor/meson.build
+++ b/examples/distributor/meson.build
@@ -6,7 +6,7 @@ 
 # To build this example as a standalone application with an already-installed
 # DPDK instance, use 'make'
 
-deps += 'distributor'
+deps += ['distributor', 'power']
 sources = files(
 	'main.c'
 )