[v7,15/17] examples/l3fwd-graph: introduce multicore dispatch worker model
Checks
Commit Message
Add new parameter "model" to choose mcore dispatch or rtc model.
And in dispatch model, the node will affinity to worker core successively.
Note:
RTE_GRAPH_MODEL_SELECT is set to GRAPH_MODEL_RTC by default. Must set
model the same as RTE_GRAPH_MODEL_SELECT If set it as rtc or mcore
dispatch explicitly. GRAPH_MODEL_MCORE_RUNTIME_SELECT means it could
choose by model in runtime.
Only support one RX node for mcore dispatch model in current
implementation.
./dpdk-l3fwd-graph -l 8,9,10,11 -n 4 -- -p 0x1 --config="(0,0,9)" -P
--model="dispatch"
Signed-off-by: Haiyue Wang <haiyue.wang@intel.com>
Signed-off-by: Cunming Liang <cunming.liang@intel.com>
Signed-off-by: Zhirun Yan <zhirun.yan@intel.com>
---
examples/l3fwd-graph/main.c | 231 +++++++++++++++++++++++++++++------
lib/graph/rte_graph_worker.h | 3 +
2 files changed, 196 insertions(+), 38 deletions(-)
Comments
On Mon, Jun 5, 2023 at 4:57 PM Zhirun Yan <zhirun.yan@intel.com> wrote:
>
> Add new parameter "model" to choose mcore dispatch or rtc model.
> And in dispatch model, the node will affinity to worker core successively.
>
> Note:
> RTE_GRAPH_MODEL_SELECT is set to GRAPH_MODEL_RTC by default. Must set
> model the same as RTE_GRAPH_MODEL_SELECT If set it as rtc or mcore
> dispatch explicitly. GRAPH_MODEL_MCORE_RUNTIME_SELECT means it could
> choose by model in runtime.
> Only support one RX node for mcore dispatch model in current
> implementation.
>
> ./dpdk-l3fwd-graph -l 8,9,10,11 -n 4 -- -p 0x1 --config="(0,0,9)" -P
> --model="dispatch"
>
> Signed-off-by: Haiyue Wang <haiyue.wang@intel.com>
> Signed-off-by: Cunming Liang <cunming.liang@intel.com>
> Signed-off-by: Zhirun Yan <zhirun.yan@intel.com>
> ---
> examples/l3fwd-graph/main.c | 231 +++++++++++++++++++++++++++++------
> lib/graph/rte_graph_worker.h | 3 +
> 2 files changed, 196 insertions(+), 38 deletions(-)
>
> diff --git a/examples/l3fwd-graph/main.c b/examples/l3fwd-graph/main.c
> index 5feeab4f0f..4ecc6c9af4 100644
> --- a/examples/l3fwd-graph/main.c
> +++ b/examples/l3fwd-graph/main.c
> @@ -23,6 +23,12 @@
> #include <rte_cycles.h>
> #include <rte_eal.h>
> #include <rte_ethdev.h>
> +#define GRAPH_MODEL_RTC 0 /* Run-to-completion model, set by default. */
> +#define GRAPH_MODEL_MCORE_DISPATCH 1 /* Dispatch model. */
> +#define GRAPH_MODEL_MCORE_RUNTIME_SELECT 2 /* Support to select model by */
> + /* parsing model in cmdline. */
After moving model to graph->model, Can you check the performance.
This may not be needed for l3fwd
or if there is not much code duplication,
Do the following remove the limitation,
#define RTE_GRAPH_MODEL_SELECT RTE_GRAPH_MODEL_RTC.
graph_main_loop change to graph_main_rtc_loop
#define RTE_GRAPH_MODEL_SELECT GRAPH_MODEL_MCORE_DISPATCH
graph_main_loop change to graph_main_mcore_loop
Select the following based on runtime option
/* Launch per-lcore init on every worker lcore */
rte_eal_mp_remote_launch(graph_main_rtc_loop, NULL, SKIP_MAIN);
or
rte_eal_mp_remote_launch(graph_main_mcore_loop, NULL, SKIP_MAIN);
> memset(&rewrite_data, 0, sizeof(rewrite_data));
> rewrite_len = sizeof(rewrite_data);
> diff --git a/lib/graph/rte_graph_worker.h b/lib/graph/rte_graph_worker.h
> index 541c373cb1..19b4c1514f 100644
> --- a/lib/graph/rte_graph_worker.h
> +++ b/lib/graph/rte_graph_worker.h
> @@ -26,6 +26,9 @@ __rte_experimental
> static inline void
> rte_graph_walk(struct rte_graph *graph)
> {
> +#define RTE_GRAPH_MODEL_RTC 0
> +#define RTE_GRAPH_MODEL_MCORE_DISPATCH 1
No need for duplicate enum. Please remove enum make this as in public
header file.
> +
Add comment here, On how application uses this, aka. before
inlcuding the worker header file #define RTE_GRAPH_MODEL_SELECT
RTE_GRAPH_MODEL_RTC.
Please change the text as needed.
> #if !defined(RTE_GRAPH_MODEL_SELECT) || RTE_GRAPH_MODEL_SELECT == RTE_GRAPH_MODEL_RTC
> rte_graph_walk_rtc(graph);
> #elif RTE_GRAPH_MODEL_SELECT == RTE_GRAPH_MODEL_MCORE_DISPATCH
> --
> 2.37.2
>
> -----Original Message-----
> From: Jerin Jacob <jerinjacobk@gmail.com>
> Sent: Monday, June 5, 2023 9:42 PM
> To: Yan, Zhirun <zhirun.yan@intel.com>
> Cc: dev@dpdk.org; jerinj@marvell.com; kirankumark@marvell.com;
> ndabilpuram@marvell.com; stephen@networkplumber.org;
> pbhagavatula@marvell.com; Liang, Cunming <cunming.liang@intel.com>; Wang,
> Haiyue <haiyue.wang@intel.com>; mattias.ronnblom
> <mattias.ronnblom@ericsson.com>
> Subject: Re: [PATCH v7 15/17] examples/l3fwd-graph: introduce multicore
> dispatch worker model
>
> On Mon, Jun 5, 2023 at 4:57 PM Zhirun Yan <zhirun.yan@intel.com> wrote:
> >
> > Add new parameter "model" to choose mcore dispatch or rtc model.
> > And in dispatch model, the node will affinity to worker core successively.
> >
> > Note:
> > RTE_GRAPH_MODEL_SELECT is set to GRAPH_MODEL_RTC by default. Must
> set
> > model the same as RTE_GRAPH_MODEL_SELECT If set it as rtc or mcore
> > dispatch explicitly. GRAPH_MODEL_MCORE_RUNTIME_SELECT means it could
> > choose by model in runtime.
> > Only support one RX node for mcore dispatch model in current
> > implementation.
> >
> > ./dpdk-l3fwd-graph -l 8,9,10,11 -n 4 -- -p 0x1 --config="(0,0,9)" -P
> > --model="dispatch"
> >
> > Signed-off-by: Haiyue Wang <haiyue.wang@intel.com>
> > Signed-off-by: Cunming Liang <cunming.liang@intel.com>
> > Signed-off-by: Zhirun Yan <zhirun.yan@intel.com>
> > ---
> > examples/l3fwd-graph/main.c | 231 +++++++++++++++++++++++++++++------
> > lib/graph/rte_graph_worker.h | 3 +
> > 2 files changed, 196 insertions(+), 38 deletions(-)
> >
> > diff --git a/examples/l3fwd-graph/main.c b/examples/l3fwd-graph/main.c
> > index 5feeab4f0f..4ecc6c9af4 100644
> > --- a/examples/l3fwd-graph/main.c
> > +++ b/examples/l3fwd-graph/main.c
> > @@ -23,6 +23,12 @@
> > #include <rte_cycles.h>
> > #include <rte_eal.h>
> > #include <rte_ethdev.h>
> > +#define GRAPH_MODEL_RTC 0 /* Run-to-completion model, set by default.
> > +*/ #define GRAPH_MODEL_MCORE_DISPATCH 1 /* Dispatch model. */
> #define
> > +GRAPH_MODEL_MCORE_RUNTIME_SELECT 2 /* Support to select model by
> */
> > + /* parsing model in
> > +cmdline. */
>
> After moving model to graph->model, Can you check the performance.
In my env, I test l3fwd-graph, I got the same throughput.(slight improve could be treated as jitter)
For graph_perf_autotest in test app, there is slight drop (About 0.2% call, similar cycles/call)
Can it be treated as jitter?
Old:
+-------------------------------+---------------+---------------+---------------+---------------+---------------+-----------+
|Node |calls |objs |realloc_count |objs/call |objs/sec(10E6) |cycles/call|
+-------------------------------+---------------+---------------+---------------+---------------+---------------+-----------+
|test_graph_perf_worker-0-0 |10175176 |2604845056 |1 |256.000 |2015.394304 |27.0000 |
|test_graph_perf_worker-1-0 |10175542 |2604938752 |1 |256.000 |2015.488000 |28.0000 |
|test_graph_perf_worker-2-0 |10175565 |2604944640 |1 |256.000 |2015.493888 |28.0000 |
|test_graph_perf_worker-3-0 |10175593 |2604951808 |1 |256.000 |2015.501056 |27.0000 |
|test_graph_perf_source-0 |10175623 |2604959488 |2 |256.000 |2015.508480 |27.0000 |
|test_graph_perf_sink-0 |10175642 |2604964352 |1 |256.000 |2015.513600 |27.0000 |
+-------------------------------+---------------+---------------+---------------+---------------+---------------+-----------+
New:
+-------------------------------+---------------+---------------+---------------+---------------+---------------+-----------+
|Node |calls |objs |realloc_count |objs/call |objs/sec(10E6) |cycles/call|
+-------------------------------+---------------+---------------+---------------+---------------+---------------+-----------+
|test_graph_perf_worker-0-0 |10154953 |2599667968 |1 |256.000 |2010.960128 |27.0000 |
|test_graph_perf_worker-1-0 |10155316 |2599760896 |1 |256.000 |2011.053056 |27.0000 |
|test_graph_perf_worker-2-0 |10155338 |2599766528 |1 |256.000 |2011.058688 |28.0000 |
|test_graph_perf_worker-3-0 |10155357 |2599771392 |1 |256.000 |2011.063552 |28.0000 |
|test_graph_perf_source-0 |10155394 |2599780864 |2 |256.000 |2011.072768 |27.0000 |
|test_graph_perf_sink-0 |10155422 |2599788032 |1 |256.000 |2011.080192 |27.0000 |
+-------------------------------+---------------+---------------+---------------+---------------+---------------+-----------+
> This may not be needed for l3fwd
>
Do you mean graph->model?
> or if there is not much code duplication,
>
> Do the following remove the limitation,
> #define RTE_GRAPH_MODEL_SELECT RTE_GRAPH_MODEL_RTC.
>
> graph_main_loop change to graph_main_rtc_loop
>
> #define RTE_GRAPH_MODEL_SELECT GRAPH_MODEL_MCORE_DISPATCH
>
> graph_main_loop change to graph_main_mcore_loop
>
> Select the following based on runtime option
> /* Launch per-lcore init on every worker lcore */
> rte_eal_mp_remote_launch(graph_main_rtc_loop, NULL, SKIP_MAIN); or
> rte_eal_mp_remote_launch(graph_main_mcore_loop, NULL, SKIP_MAIN);
>
We want to 1. Use same API (rte_graph_walk()) for diff models.
2. no performance drop for rtc (use RTE_GRAPH_MODEL_SELECT in compile time)
If I understand correctly, I need remove graph->model and only use
RTE_GRAPH_MODEL_SELECT to select models?
And change it as
graph_main_rtc_loop()
{
While(1)
rte_graph_walk_rtc()
}
But actually, I think graph->model is need, especially for config stage and for runtime config
If set RTE_GRAPH_MODEL_SELECT_RUNTIME.
We need the model type to decide to alloc workqueue and use RTE_GRAPH_MODEL_SELECT
to choose the walk.
> > memset(&rewrite_data, 0, sizeof(rewrite_data));
> > rewrite_len = sizeof(rewrite_data); diff --git
> > a/lib/graph/rte_graph_worker.h b/lib/graph/rte_graph_worker.h index
> > 541c373cb1..19b4c1514f 100644
> > --- a/lib/graph/rte_graph_worker.h
> > +++ b/lib/graph/rte_graph_worker.h
> > @@ -26,6 +26,9 @@ __rte_experimental
> > static inline void
> > rte_graph_walk(struct rte_graph *graph) {
> > +#define RTE_GRAPH_MODEL_RTC 0
> > +#define RTE_GRAPH_MODEL_MCORE_DISPATCH 1
>
> No need for duplicate enum. Please remove enum make this as in public header
> file.
>
Yes, it will cause no defined warnings.
Thanks for your comments.
I will remove enum and define model type macros in public header. And also change
the related structs/APIs.
>
> > +
>
> Add comment here, On how application uses this, aka. before inlcuding the
> worker header file #define RTE_GRAPH_MODEL_SELECT
> RTE_GRAPH_MODEL_RTC.
> Please change the text as needed.
Yes, I will add comment and add the usage in document.
>
>
> > #if !defined(RTE_GRAPH_MODEL_SELECT) || RTE_GRAPH_MODEL_SELECT ==
> RTE_GRAPH_MODEL_RTC
> > rte_graph_walk_rtc(graph);
> > #elif RTE_GRAPH_MODEL_SELECT ==
> RTE_GRAPH_MODEL_MCORE_DISPATCH
> > --
> > 2.37.2
> >
On Tue, Jun 6, 2023 at 10:41 AM Yan, Zhirun <zhirun.yan@intel.com> wrote:
>
>
>
> > -----Original Message-----
> > From: Jerin Jacob <jerinjacobk@gmail.com>
> > Sent: Monday, June 5, 2023 9:42 PM
> > To: Yan, Zhirun <zhirun.yan@intel.com>
> > Cc: dev@dpdk.org; jerinj@marvell.com; kirankumark@marvell.com;
> > ndabilpuram@marvell.com; stephen@networkplumber.org;
> > pbhagavatula@marvell.com; Liang, Cunming <cunming.liang@intel.com>; Wang,
> > Haiyue <haiyue.wang@intel.com>; mattias.ronnblom
> > <mattias.ronnblom@ericsson.com>
> > Subject: Re: [PATCH v7 15/17] examples/l3fwd-graph: introduce multicore
> > dispatch worker model
> >
> > On Mon, Jun 5, 2023 at 4:57 PM Zhirun Yan <zhirun.yan@intel.com> wrote:
> > >
> > > Add new parameter "model" to choose mcore dispatch or rtc model.
> > > And in dispatch model, the node will affinity to worker core successively.
> > >
> > > Note:
> > > RTE_GRAPH_MODEL_SELECT is set to GRAPH_MODEL_RTC by default. Must
> > set
> > > model the same as RTE_GRAPH_MODEL_SELECT If set it as rtc or mcore
> > > dispatch explicitly. GRAPH_MODEL_MCORE_RUNTIME_SELECT means it could
> > > choose by model in runtime.
> > > Only support one RX node for mcore dispatch model in current
> > > implementation.
> > >
> > > ./dpdk-l3fwd-graph -l 8,9,10,11 -n 4 -- -p 0x1 --config="(0,0,9)" -P
> > > --model="dispatch"
> > >
> > > Signed-off-by: Haiyue Wang <haiyue.wang@intel.com>
> > > Signed-off-by: Cunming Liang <cunming.liang@intel.com>
> > > Signed-off-by: Zhirun Yan <zhirun.yan@intel.com>
> > > ---
> > > examples/l3fwd-graph/main.c | 231 +++++++++++++++++++++++++++++------
> > > lib/graph/rte_graph_worker.h | 3 +
> > > 2 files changed, 196 insertions(+), 38 deletions(-)
> > >
> > > diff --git a/examples/l3fwd-graph/main.c b/examples/l3fwd-graph/main.c
> > > index 5feeab4f0f..4ecc6c9af4 100644
> > > --- a/examples/l3fwd-graph/main.c
> > > +++ b/examples/l3fwd-graph/main.c
> > > @@ -23,6 +23,12 @@
> > > #include <rte_cycles.h>
> > > #include <rte_eal.h>
> > > #include <rte_ethdev.h>
> > > +#define GRAPH_MODEL_RTC 0 /* Run-to-completion model, set by default.
> > > +*/ #define GRAPH_MODEL_MCORE_DISPATCH 1 /* Dispatch model. */
> > #define
> > > +GRAPH_MODEL_MCORE_RUNTIME_SELECT 2 /* Support to select model by
> > */
> > > + /* parsing model in
> > > +cmdline. */
> >
> > After moving model to graph->model, Can you check the performance.
>
> In my env, I test l3fwd-graph, I got the same throughput.(slight improve could be treated as jitter)
> For graph_perf_autotest in test app, there is slight drop (About 0.2% call, similar cycles/call)
> Can it be treated as jitter?
Most likely.
Try in following in fasth path.
const ... model = graph->model;
>
> Old:
> +-------------------------------+---------------+---------------+---------------+---------------+---------------+-----------+
> |Node |calls |objs |realloc_count |objs/call |objs/sec(10E6) |cycles/call|
> +-------------------------------+---------------+---------------+---------------+---------------+---------------+-----------+
> |test_graph_perf_worker-0-0 |10175176 |2604845056 |1 |256.000 |2015.394304 |27.0000 |
> |test_graph_perf_worker-1-0 |10175542 |2604938752 |1 |256.000 |2015.488000 |28.0000 |
> |test_graph_perf_worker-2-0 |10175565 |2604944640 |1 |256.000 |2015.493888 |28.0000 |
> |test_graph_perf_worker-3-0 |10175593 |2604951808 |1 |256.000 |2015.501056 |27.0000 |
> |test_graph_perf_source-0 |10175623 |2604959488 |2 |256.000 |2015.508480 |27.0000 |
> |test_graph_perf_sink-0 |10175642 |2604964352 |1 |256.000 |2015.513600 |27.0000 |
> +-------------------------------+---------------+---------------+---------------+---------------+---------------+-----------+
>
> New:
> +-------------------------------+---------------+---------------+---------------+---------------+---------------+-----------+
> |Node |calls |objs |realloc_count |objs/call |objs/sec(10E6) |cycles/call|
> +-------------------------------+---------------+---------------+---------------+---------------+---------------+-----------+
> |test_graph_perf_worker-0-0 |10154953 |2599667968 |1 |256.000 |2010.960128 |27.0000 |
> |test_graph_perf_worker-1-0 |10155316 |2599760896 |1 |256.000 |2011.053056 |27.0000 |
> |test_graph_perf_worker-2-0 |10155338 |2599766528 |1 |256.000 |2011.058688 |28.0000 |
> |test_graph_perf_worker-3-0 |10155357 |2599771392 |1 |256.000 |2011.063552 |28.0000 |
> |test_graph_perf_source-0 |10155394 |2599780864 |2 |256.000 |2011.072768 |27.0000 |
> |test_graph_perf_sink-0 |10155422 |2599788032 |1 |256.000 |2011.080192 |27.0000 |
> +-------------------------------+---------------+---------------+---------------+---------------+---------------+-----------+
>
> > This may not be needed for l3fwd
> >
> Do you mean graph->model?
Yes.
>
> > or if there is not much code duplication,
> >
> > Do the following remove the limitation,
> > #define RTE_GRAPH_MODEL_SELECT RTE_GRAPH_MODEL_RTC.
> >
> > graph_main_loop change to graph_main_rtc_loop
> >
> > #define RTE_GRAPH_MODEL_SELECT GRAPH_MODEL_MCORE_DISPATCH
> >
> > graph_main_loop change to graph_main_mcore_loop
> >
> > Select the following based on runtime option
> > /* Launch per-lcore init on every worker lcore */
> > rte_eal_mp_remote_launch(graph_main_rtc_loop, NULL, SKIP_MAIN); or
> > rte_eal_mp_remote_launch(graph_main_mcore_loop, NULL, SKIP_MAIN);
> >
>
> We want to 1. Use same API (rte_graph_walk()) for diff models.
> 2. no performance drop for rtc (use RTE_GRAPH_MODEL_SELECT in compile time)
>
> If I understand correctly, I need remove graph->model and only use
> RTE_GRAPH_MODEL_SELECT to select models?
>
> And change it as
> graph_main_rtc_loop()
> {
> While(1)
> rte_graph_walk_rtc()
> }
>
> But actually, I think graph->model is need, especially for config stage and for runtime config
> If set RTE_GRAPH_MODEL_SELECT_RUNTIME.
Yes. Agree. If there is no MAJOR performance issues lets use
RTE_GRAPH_MODEL_SELECT_RUNTIME for l3fwd.
> We need the model type to decide to alloc workqueue and use RTE_GRAPH_MODEL_SELECT
> to choose the walk.
>
>
> > > memset(&rewrite_data, 0, sizeof(rewrite_data));
> > > rewrite_len = sizeof(rewrite_data); diff --git
> > > a/lib/graph/rte_graph_worker.h b/lib/graph/rte_graph_worker.h index
> > > 541c373cb1..19b4c1514f 100644
> > > --- a/lib/graph/rte_graph_worker.h
> > > +++ b/lib/graph/rte_graph_worker.h
> > > @@ -26,6 +26,9 @@ __rte_experimental
> > > static inline void
> > > rte_graph_walk(struct rte_graph *graph) {
> > > +#define RTE_GRAPH_MODEL_RTC 0
> > > +#define RTE_GRAPH_MODEL_MCORE_DISPATCH 1
> >
> > No need for duplicate enum. Please remove enum make this as in public header
> > file.
> >
> Yes, it will cause no defined warnings.
> Thanks for your comments.
> I will remove enum and define model type macros in public header. And also change
> the related structs/APIs.
Also add a comment in RTE_GRAPH_MODEL_MCORE_DISPATCH, If adding
new entry, then update graph_is_valid API.
>
> >
> > > +
> >
> > Add comment here, On how application uses this, aka. before inlcuding the
> > worker header file #define RTE_GRAPH_MODEL_SELECT
> > RTE_GRAPH_MODEL_RTC.
> > Please change the text as needed.
> Yes, I will add comment and add the usage in document.
>
> >
> >
> > > #if !defined(RTE_GRAPH_MODEL_SELECT) || RTE_GRAPH_MODEL_SELECT ==
> > RTE_GRAPH_MODEL_RTC
> > > rte_graph_walk_rtc(graph);
> > > #elif RTE_GRAPH_MODEL_SELECT ==
> > RTE_GRAPH_MODEL_MCORE_DISPATCH
> > > --
> > > 2.37.2
> > >
> -----Original Message-----
> From: Jerin Jacob <jerinjacobk@gmail.com>
> Sent: Tuesday, June 6, 2023 1:55 PM
> To: Yan, Zhirun <zhirun.yan@intel.com>
> Cc: dev@dpdk.org; jerinj@marvell.com; kirankumark@marvell.com;
> ndabilpuram@marvell.com; stephen@networkplumber.org;
> pbhagavatula@marvell.com; Liang, Cunming <cunming.liang@intel.com>; Wang,
> Haiyue <haiyue.wang@intel.com>; mattias.ronnblom
> <mattias.ronnblom@ericsson.com>
> Subject: Re: [PATCH v7 15/17] examples/l3fwd-graph: introduce multicore
> dispatch worker model
>
> On Tue, Jun 6, 2023 at 10:41 AM Yan, Zhirun <zhirun.yan@intel.com> wrote:
> >
> >
> >
> > > -----Original Message-----
> > > From: Jerin Jacob <jerinjacobk@gmail.com>
> > > Sent: Monday, June 5, 2023 9:42 PM
> > > To: Yan, Zhirun <zhirun.yan@intel.com>
> > > Cc: dev@dpdk.org; jerinj@marvell.com; kirankumark@marvell.com;
> > > ndabilpuram@marvell.com; stephen@networkplumber.org;
> > > pbhagavatula@marvell.com; Liang, Cunming <cunming.liang@intel.com>;
> > > Wang, Haiyue <haiyue.wang@intel.com>; mattias.ronnblom
> > > <mattias.ronnblom@ericsson.com>
> > > Subject: Re: [PATCH v7 15/17] examples/l3fwd-graph: introduce
> > > multicore dispatch worker model
> > >
> > > On Mon, Jun 5, 2023 at 4:57 PM Zhirun Yan <zhirun.yan@intel.com> wrote:
> > > >
> > > > Add new parameter "model" to choose mcore dispatch or rtc model.
> > > > And in dispatch model, the node will affinity to worker core successively.
> > > >
> > > > Note:
> > > > RTE_GRAPH_MODEL_SELECT is set to GRAPH_MODEL_RTC by default.
> Must
> > > set
> > > > model the same as RTE_GRAPH_MODEL_SELECT If set it as rtc or mcore
> > > > dispatch explicitly. GRAPH_MODEL_MCORE_RUNTIME_SELECT means it
> > > > could choose by model in runtime.
> > > > Only support one RX node for mcore dispatch model in current
> > > > implementation.
> > > >
> > > > ./dpdk-l3fwd-graph -l 8,9,10,11 -n 4 -- -p 0x1 --config="(0,0,9)"
> > > > -P --model="dispatch"
> > > >
> > > > Signed-off-by: Haiyue Wang <haiyue.wang@intel.com>
> > > > Signed-off-by: Cunming Liang <cunming.liang@intel.com>
> > > > Signed-off-by: Zhirun Yan <zhirun.yan@intel.com>
> > > > ---
> > > > examples/l3fwd-graph/main.c | 231 +++++++++++++++++++++++++++++--
> ----
> > > > lib/graph/rte_graph_worker.h | 3 +
> > > > 2 files changed, 196 insertions(+), 38 deletions(-)
> > > >
> > > > diff --git a/examples/l3fwd-graph/main.c
> > > > b/examples/l3fwd-graph/main.c index 5feeab4f0f..4ecc6c9af4 100644
> > > > --- a/examples/l3fwd-graph/main.c
> > > > +++ b/examples/l3fwd-graph/main.c
> > > > @@ -23,6 +23,12 @@
> > > > #include <rte_cycles.h>
> > > > #include <rte_eal.h>
> > > > #include <rte_ethdev.h>
> > > > +#define GRAPH_MODEL_RTC 0 /* Run-to-completion model, set by
> default.
> > > > +*/ #define GRAPH_MODEL_MCORE_DISPATCH 1 /* Dispatch model. */
> > > #define
> > > > +GRAPH_MODEL_MCORE_RUNTIME_SELECT 2 /* Support to select model
> by
> > > */
> > > > + /* parsing model in
> > > > +cmdline. */
> > >
> > > After moving model to graph->model, Can you check the performance.
> >
> > In my env, I test l3fwd-graph, I got the same throughput.(slight
> > improve could be treated as jitter) For graph_perf_autotest in test
> > app, there is slight drop (About 0.2% call, similar cycles/call) Can it be treated
> as jitter?
>
> Most likely.
> Try in following in fasth path.
> const ... model = graph->model;
>
By default we set RTE_GRAPH_MODEL_SELECT == RTE_GRAPH_MODEL_RTC,
So get model is only in stats, not in fast path.
I found the root cause is coming from the additional unit test.
But actually, it should not be called. All added UT is under graph_autotest, not in graph_perf_autotest.
It's strange if I destroy the cloned graph in testcase in the additional UT, can get same stats as we expected(even
better performance). A little more calls, objs and better cycles/call (28->27).
New:
+-------------------------------+---------------+---------------+---------------+---------------+---------------+-----------+
|Node |calls |objs |realloc_count |objs/call |objs/sec(10E6) |cycles/call|
+-------------------------------+---------------+---------------+---------------+---------------+---------------+-----------+
|test_graph_perf_worker-0-0 |10286353 |2633306368 |1 |256.000 |2037.676032 |27.0000 |
|test_graph_perf_worker-1-0 |10286709 |2633397504 |1 |256.000 |2037.767168 |27.0000 |
|test_graph_perf_worker-2-0 |10286732 |2633403392 |1 |256.000 |2037.773056 |27.0000 |
|test_graph_perf_worker-3-0 |10286751 |2633408256 |1 |256.000 |2037.777920 |27.0000 |
|test_graph_perf_source-0 |10286774 |2633414144 |2 |256.000 |2037.783552 |27.0000 |
|test_graph_perf_sink-0 |10286791 |2633418496 |1 |256.000 |2037.788160 |27.0000 |
+-------------------------------+---------------+---------------+---------------+---------------+---------------+-----------+
> >
> > Old:
> > +-------------------------------+---------------+---------------+---------------+------------
> ---+---------------+-----------+
> > |Node |calls |objs |realloc_count |objs/call
> |objs/sec(10E6) |cycles/call|
> > +-------------------------------+---------------+---------------+---------------+------------
> ---+---------------+-----------+
> > |test_graph_perf_worker-0-0 |10175176 |2604845056 |1
> |256.000 |2015.394304 |27.0000 |
> > |test_graph_perf_worker-1-0 |10175542 |2604938752 |1
> |256.000 |2015.488000 |28.0000 |
> > |test_graph_perf_worker-2-0 |10175565 |2604944640 |1
> |256.000 |2015.493888 |28.0000 |
> > |test_graph_perf_worker-3-0 |10175593 |2604951808 |1
> |256.000 |2015.501056 |27.0000 |
> > |test_graph_perf_source-0 |10175623 |2604959488 |2
> |256.000 |2015.508480 |27.0000 |
> > |test_graph_perf_sink-0 |10175642 |2604964352 |1
> |256.000 |2015.513600 |27.0000 |
> > +-------------------------------+---------------+---------------+---------------+------------
> ---+---------------+-----------+
> >
> > New:
> > +-------------------------------+---------------+---------------+---------------+------------
> ---+---------------+-----------+
> > |Node |calls |objs |realloc_count |objs/call
> |objs/sec(10E6) |cycles/call|
> > +-------------------------------+---------------+---------------+---------------+------------
> ---+---------------+-----------+
> > |test_graph_perf_worker-0-0 |10154953 |2599667968 |1
> |256.000 |2010.960128 |27.0000 |
> > |test_graph_perf_worker-1-0 |10155316 |2599760896 |1
> |256.000 |2011.053056 |27.0000 |
> > |test_graph_perf_worker-2-0 |10155338 |2599766528 |1
> |256.000 |2011.058688 |28.0000 |
> > |test_graph_perf_worker-3-0 |10155357 |2599771392 |1
> |256.000 |2011.063552 |28.0000 |
> > |test_graph_perf_source-0 |10155394 |2599780864 |2
> |256.000 |2011.072768 |27.0000 |
> > |test_graph_perf_sink-0 |10155422 |2599788032 |1
> |256.000 |2011.080192 |27.0000 |
> > +-------------------------------+---------------+---------------+---------------+------------
> ---+---------------+-----------+
> >
> > > This may not be needed for l3fwd
> > >
> > Do you mean graph->model?
>
> Yes.
>
> >
> > > or if there is not much code duplication,
> > >
> > > Do the following remove the limitation, #define
> > > RTE_GRAPH_MODEL_SELECT RTE_GRAPH_MODEL_RTC.
> > >
> > > graph_main_loop change to graph_main_rtc_loop
> > >
> > > #define RTE_GRAPH_MODEL_SELECT GRAPH_MODEL_MCORE_DISPATCH
> > >
> > > graph_main_loop change to graph_main_mcore_loop
> > >
> > > Select the following based on runtime option
> > > /* Launch per-lcore init on every worker lcore */
> > > rte_eal_mp_remote_launch(graph_main_rtc_loop, NULL, SKIP_MAIN);
> or
> > > rte_eal_mp_remote_launch(graph_main_mcore_loop, NULL,
> > > SKIP_MAIN);
> > >
> >
> > We want to 1. Use same API (rte_graph_walk()) for diff models.
> > 2. no performance drop for rtc (use RTE_GRAPH_MODEL_SELECT in compile
> > time)
> >
> > If I understand correctly, I need remove graph->model and only use
> > RTE_GRAPH_MODEL_SELECT to select models?
> >
> > And change it as
> > graph_main_rtc_loop()
> > {
> > While(1)
> > rte_graph_walk_rtc()
> > }
> >
> > But actually, I think graph->model is need, especially for config
> > stage and for runtime config If set RTE_GRAPH_MODEL_SELECT_RUNTIME.
>
> Yes. Agree. If there is no MAJOR performance issues lets use
> RTE_GRAPH_MODEL_SELECT_RUNTIME for l3fwd.
>
Thanks, there is no major performance issues. I will keep to use current
scheme.
> > We need the model type to decide to alloc workqueue and use
> > RTE_GRAPH_MODEL_SELECT to choose the walk.
> >
> >
> > > > memset(&rewrite_data, 0, sizeof(rewrite_data));
> > > > rewrite_len = sizeof(rewrite_data); diff --git
> > > > a/lib/graph/rte_graph_worker.h b/lib/graph/rte_graph_worker.h
> > > > index 541c373cb1..19b4c1514f 100644
> > > > --- a/lib/graph/rte_graph_worker.h
> > > > +++ b/lib/graph/rte_graph_worker.h
> > > > @@ -26,6 +26,9 @@ __rte_experimental static inline void
> > > > rte_graph_walk(struct rte_graph *graph) {
> > > > +#define RTE_GRAPH_MODEL_RTC 0
> > > > +#define RTE_GRAPH_MODEL_MCORE_DISPATCH 1
> > >
> > > No need for duplicate enum. Please remove enum make this as in
> > > public header file.
> > >
> > Yes, it will cause no defined warnings.
> > Thanks for your comments.
> > I will remove enum and define model type macros in public header. And
> > also change the related structs/APIs.
>
> Also add a comment in RTE_GRAPH_MODEL_MCORE_DISPATCH, If adding new
> entry, then update graph_is_valid API.
>
Got it. Thanks very much.
> >
> > >
> > > > +
> > >
> > > Add comment here, On how application uses this, aka. before
> > > inlcuding the worker header file #define RTE_GRAPH_MODEL_SELECT
> > > RTE_GRAPH_MODEL_RTC.
> > > Please change the text as needed.
> > Yes, I will add comment and add the usage in document.
> >
> > >
> > >
> > > > #if !defined(RTE_GRAPH_MODEL_SELECT) ||
> RTE_GRAPH_MODEL_SELECT ==
> > > RTE_GRAPH_MODEL_RTC
> > > > rte_graph_walk_rtc(graph); #elif RTE_GRAPH_MODEL_SELECT
> > > > ==
> > > RTE_GRAPH_MODEL_MCORE_DISPATCH
> > > > --
> > > > 2.37.2
> > > >
@@ -23,6 +23,12 @@
#include <rte_cycles.h>
#include <rte_eal.h>
#include <rte_ethdev.h>
+#define GRAPH_MODEL_RTC 0 /* Run-to-completion model, set by default. */
+#define GRAPH_MODEL_MCORE_DISPATCH 1 /* Dispatch model. */
+#define GRAPH_MODEL_MCORE_RUNTIME_SELECT 2 /* Support to select model by */
+ /* parsing model in cmdline. */
+#undef RTE_GRAPH_MODEL_SELECT
+#define RTE_GRAPH_MODEL_SELECT GRAPH_MODEL_RTC
#include <rte_graph_worker.h>
#include <rte_launch.h>
#include <rte_lcore.h>
@@ -55,6 +61,9 @@
#define NB_SOCKETS 8
+/* Graph module */
+#define WORKER_MODEL_RTC "rtc"
+#define WORKER_MODEL_MCORE_DISPATCH "dispatch"
/* Static global variables used within this file. */
static uint16_t nb_rxd = RX_DESC_DEFAULT;
static uint16_t nb_txd = TX_DESC_DEFAULT;
@@ -88,6 +97,8 @@ struct lcore_rx_queue {
char node_name[RTE_NODE_NAMESIZE];
};
+static enum rte_graph_worker_model model_conf = RTE_GRAPH_MODEL_DEFAULT;
+
/* Lcore conf */
struct lcore_conf {
uint16_t n_rx_queue;
@@ -153,6 +164,19 @@ static struct ipv4_l3fwd_lpm_route ipv4_l3fwd_lpm_route_array[] = {
{RTE_IPV4(198, 18, 6, 0), 24, 6}, {RTE_IPV4(198, 18, 7, 0), 24, 7},
};
+static int
+check_worker_model_params(void)
+{
+ if (model_conf == RTE_GRAPH_MODEL_MCORE_DISPATCH &&
+ nb_lcore_params > 1) {
+ printf("Exceeded max number of lcore params for remote model: %hu\n",
+ nb_lcore_params);
+ return -1;
+ }
+
+ return 0;
+}
+
static int
check_lcore_params(void)
{
@@ -276,6 +300,7 @@ print_usage(const char *prgname)
" --eth-dest=X,MM:MM:MM:MM:MM:MM: Ethernet destination for "
"port X\n"
" --max-pkt-len PKTLEN: maximum packet length in decimal (64-9600)\n"
+ " --model NAME: walking model name, dispatch or rtc(by default)\n"
" --no-numa: Disable numa awareness\n"
" --per-port-pool: Use separate buffer pool per port\n"
" --pcap-enable: Enables pcap capture\n"
@@ -318,6 +343,19 @@ parse_max_pkt_len(const char *pktlen)
return len;
}
+static void
+parse_worker_model(const char *model)
+{
+ if (strcmp(model, WORKER_MODEL_MCORE_DISPATCH) == 0)
+ model_conf = RTE_GRAPH_MODEL_MCORE_DISPATCH;
+ else if (strcmp(model, WORKER_MODEL_RTC) == 0)
+ model_conf = RTE_GRAPH_MODEL_RTC;
+
+ if (model_conf != RTE_GRAPH_MODEL_SELECT &&
+ RTE_GRAPH_MODEL_SELECT <= RTE_GRAPH_MODEL_MCORE_DISPATCH)
+ rte_exit(EXIT_FAILURE, "Invalid worker model: %s", model);
+}
+
static int
parse_portmask(const char *portmask)
{
@@ -434,6 +472,8 @@ static const char short_options[] = "p:" /* portmask */
#define CMD_LINE_OPT_PCAP_ENABLE "pcap-enable"
#define CMD_LINE_OPT_NUM_PKT_CAP "pcap-num-cap"
#define CMD_LINE_OPT_PCAP_FILENAME "pcap-file-name"
+#define CMD_LINE_OPT_WORKER_MODEL "model"
+
enum {
/* Long options mapped to a short option */
@@ -449,6 +489,7 @@ enum {
CMD_LINE_OPT_PARSE_PCAP_ENABLE,
CMD_LINE_OPT_PARSE_NUM_PKT_CAP,
CMD_LINE_OPT_PCAP_FILENAME_CAP,
+ CMD_LINE_OPT_WORKER_MODEL_TYPE,
};
static const struct option lgopts[] = {
@@ -460,6 +501,7 @@ static const struct option lgopts[] = {
{CMD_LINE_OPT_PCAP_ENABLE, 0, 0, CMD_LINE_OPT_PARSE_PCAP_ENABLE},
{CMD_LINE_OPT_NUM_PKT_CAP, 1, 0, CMD_LINE_OPT_PARSE_NUM_PKT_CAP},
{CMD_LINE_OPT_PCAP_FILENAME, 1, 0, CMD_LINE_OPT_PCAP_FILENAME_CAP},
+ {CMD_LINE_OPT_WORKER_MODEL, 1, 0, CMD_LINE_OPT_WORKER_MODEL_TYPE},
{NULL, 0, 0, 0},
};
@@ -551,6 +593,11 @@ parse_args(int argc, char **argv)
printf("Pcap file name: %s\n", pcap_filename);
break;
+ case CMD_LINE_OPT_WORKER_MODEL_TYPE:
+ printf("Use new worker model: %s\n", optarg);
+ parse_worker_model(optarg);
+ break;
+
default:
print_usage(prgname);
return -1;
@@ -788,6 +835,142 @@ config_port_max_pkt_len(struct rte_eth_conf *conf,
return 0;
}
+static void
+graph_config_mcore_dispatch(struct rte_graph_param graph_conf)
+{
+ uint16_t nb_patterns = graph_conf.nb_node_patterns;
+ int worker_count = rte_lcore_count() - 1;
+ int main_lcore_id = rte_get_main_lcore();
+ rte_graph_t main_graph_id = 0;
+ struct rte_node *node_tmp;
+ struct lcore_conf *qconf;
+ struct rte_graph *graph;
+ rte_graph_t graph_id;
+ rte_graph_off_t off;
+ int n_rx_node = 0;
+ int worker_lcore;
+ rte_node_t count;
+ int i, j;
+ int ret;
+
+ for (j = 0; j < nb_lcore_params; j++) {
+ qconf = &lcore_conf[lcore_params[j].lcore_id];
+ /* Add rx node patterns of all lcore */
+ for (i = 0; i < qconf->n_rx_queue; i++) {
+ char *node_name = qconf->rx_queue_list[i].node_name;
+ unsigned int lcore_id = lcore_params[j].lcore_id;
+
+ graph_conf.node_patterns[nb_patterns + n_rx_node + i] = node_name;
+ n_rx_node++;
+ ret = rte_graph_model_mcore_dispatch_node_lcore_affinity_set(node_name,
+ lcore_id);
+ if (ret == 0)
+ printf("Set node %s affinity to lcore %u\n", node_name,
+ lcore_params[j].lcore_id);
+ }
+ }
+
+ graph_conf.nb_node_patterns = nb_patterns + n_rx_node;
+ graph_conf.socket_id = rte_lcore_to_socket_id(main_lcore_id);
+
+ qconf = &lcore_conf[main_lcore_id];
+ snprintf(qconf->name, sizeof(qconf->name), "worker_%u",
+ main_lcore_id);
+
+ /* create main graph */
+ main_graph_id = rte_graph_create(qconf->name, &graph_conf);
+ if (main_graph_id == RTE_GRAPH_ID_INVALID)
+ rte_exit(EXIT_FAILURE,
+ "rte_graph_create(): main_graph_id invalid for lcore %u\n",
+ main_lcore_id);
+
+ /* set the graph model for the main graph */
+ rte_graph_worker_model_set(RTE_GRAPH_MODEL_MCORE_DISPATCH);
+ qconf->graph_id = main_graph_id;
+ qconf->graph = rte_graph_lookup(qconf->name);
+ if (!qconf->graph)
+ rte_exit(EXIT_FAILURE,
+ "rte_graph_lookup(): graph %s not found\n",
+ qconf->name);
+
+ graph = qconf->graph;
+ worker_lcore = lcore_params[nb_lcore_params - 1].lcore_id;
+ rte_graph_foreach_node(count, off, graph, node_tmp) {
+ /* Need to set the node Lcore affinity before clone graph for each lcore */
+ if (node_tmp->dispatch.lcore_id == RTE_MAX_LCORE) {
+ worker_lcore = rte_get_next_lcore(worker_lcore, true, 1);
+ ret = rte_graph_model_mcore_dispatch_node_lcore_affinity_set(node_tmp->name,
+ worker_lcore);
+ if (ret == 0)
+ printf("Set node %s affinity to lcore %u\n",
+ node_tmp->name, worker_lcore);
+ }
+ }
+
+ worker_lcore = main_lcore_id;
+ for (i = 0; i < worker_count; i++) {
+ worker_lcore = rte_get_next_lcore(worker_lcore, true, 1);
+
+ qconf = &lcore_conf[worker_lcore];
+ snprintf(qconf->name, sizeof(qconf->name), "cloned-%u", worker_lcore);
+ graph_id = rte_graph_clone(main_graph_id, qconf->name, &graph_conf);
+ ret = rte_graph_model_mcore_dispatch_core_bind(graph_id, worker_lcore);
+ if (ret == 0)
+ printf("bind graph %d to lcore %u\n", graph_id, worker_lcore);
+
+ /* full cloned graph name */
+ snprintf(qconf->name, sizeof(qconf->name), "%s",
+ rte_graph_id_to_name(graph_id));
+ qconf->graph_id = graph_id;
+ qconf->graph = rte_graph_lookup(qconf->name);
+ if (!qconf->graph)
+ rte_exit(EXIT_FAILURE,
+ "Failed to lookup graph %s\n",
+ qconf->name);
+ continue;
+ }
+}
+
+static void
+graph_config_rtc(struct rte_graph_param graph_conf)
+{
+ uint16_t nb_patterns = graph_conf.nb_node_patterns;
+ struct lcore_conf *qconf;
+ rte_graph_t graph_id;
+ uint32_t lcore_id;
+ rte_edge_t i;
+
+ for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
+ if (rte_lcore_is_enabled(lcore_id) == 0)
+ continue;
+
+ qconf = &lcore_conf[lcore_id];
+ /* Skip graph creation if no source exists */
+ if (!qconf->n_rx_queue)
+ continue;
+ /* Add rx node patterns of this lcore */
+ for (i = 0; i < qconf->n_rx_queue; i++) {
+ graph_conf.node_patterns[nb_patterns + i] =
+ qconf->rx_queue_list[i].node_name;
+ }
+ graph_conf.nb_node_patterns = nb_patterns + i;
+ graph_conf.socket_id = rte_lcore_to_socket_id(lcore_id);
+ snprintf(qconf->name, sizeof(qconf->name), "worker_%u",
+ lcore_id);
+ graph_id = rte_graph_create(qconf->name, &graph_conf);
+ if (graph_id == RTE_GRAPH_ID_INVALID)
+ rte_exit(EXIT_FAILURE,
+ "rte_graph_create(): graph_id invalid for lcore %u\n",
+ lcore_id);
+ qconf->graph_id = graph_id;
+ qconf->graph = rte_graph_lookup(qconf->name);
+ if (!qconf->graph)
+ rte_exit(EXIT_FAILURE,
+ "rte_graph_lookup(): graph %s not found\n",
+ qconf->name);
+ }
+}
+
int
main(int argc, char **argv)
{
@@ -840,6 +1023,9 @@ main(int argc, char **argv)
if (check_lcore_params() < 0)
rte_exit(EXIT_FAILURE, "check_lcore_params() failed\n");
+ if (check_worker_model_params() < 0)
+ rte_exit(EXIT_FAILURE, "check_worker_model_params() failed\n");
+
ret = init_lcore_rx_queues();
if (ret < 0)
rte_exit(EXIT_FAILURE, "init_lcore_rx_queues() failed\n");
@@ -1079,51 +1265,20 @@ main(int argc, char **argv)
memset(&graph_conf, 0, sizeof(graph_conf));
graph_conf.node_patterns = node_patterns;
+ graph_conf.nb_node_patterns = nb_patterns;
/* Pcap config */
graph_conf.pcap_enable = pcap_trace_enable;
graph_conf.num_pkt_to_capture = packet_to_capture;
graph_conf.pcap_filename = pcap_filename;
- for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
- rte_graph_t graph_id;
- rte_edge_t i;
-
- if (rte_lcore_is_enabled(lcore_id) == 0)
- continue;
-
- qconf = &lcore_conf[lcore_id];
-
- /* Skip graph creation if no source exists */
- if (!qconf->n_rx_queue)
- continue;
-
- /* Add rx node patterns of this lcore */
- for (i = 0; i < qconf->n_rx_queue; i++) {
- graph_conf.node_patterns[nb_patterns + i] =
- qconf->rx_queue_list[i].node_name;
- }
-
- graph_conf.nb_node_patterns = nb_patterns + i;
- graph_conf.socket_id = rte_lcore_to_socket_id(lcore_id);
-
- snprintf(qconf->name, sizeof(qconf->name), "worker_%u",
- lcore_id);
-
- graph_id = rte_graph_create(qconf->name, &graph_conf);
- if (graph_id == RTE_GRAPH_ID_INVALID)
- rte_exit(EXIT_FAILURE,
- "rte_graph_create(): graph_id invalid"
- " for lcore %u\n", lcore_id);
+ if (model_conf == RTE_GRAPH_MODEL_MCORE_DISPATCH)
+ graph_config_mcore_dispatch(graph_conf);
+ else
+ graph_config_rtc(graph_conf);
- qconf->graph_id = graph_id;
- qconf->graph = rte_graph_lookup(qconf->name);
- /* >8 End of graph initialization. */
- if (!qconf->graph)
- rte_exit(EXIT_FAILURE,
- "rte_graph_lookup(): graph %s not found\n",
- qconf->name);
- }
+ rte_graph_worker_model_set(model_conf);
+ /* >8 End of graph initialization. */
memset(&rewrite_data, 0, sizeof(rewrite_data));
rewrite_len = sizeof(rewrite_data);
@@ -26,6 +26,9 @@ __rte_experimental
static inline void
rte_graph_walk(struct rte_graph *graph)
{
+#define RTE_GRAPH_MODEL_RTC 0
+#define RTE_GRAPH_MODEL_MCORE_DISPATCH 1
+
#if !defined(RTE_GRAPH_MODEL_SELECT) || RTE_GRAPH_MODEL_SELECT == RTE_GRAPH_MODEL_RTC
rte_graph_walk_rtc(graph);
#elif RTE_GRAPH_MODEL_SELECT == RTE_GRAPH_MODEL_MCORE_DISPATCH