[RFC,v1,4/6] graph: enhance graph walk by cross-core dispatch

Message ID 20220908020959.1675953-5-zhirun.yan@intel.com (mailing list archive)
State Superseded, archived
Delegated to: Thomas Monjalon
Series: graph enhancement for multi-core dispatch

Checks

Context        Check    Description
ci/checkpatch  warning  coding style issues

Commit Message

Yan, Zhirun Sept. 8, 2022, 2:09 a.m. UTC
  This patch enhances the task scheduling mechanism to enable dispatching
tasks to other worker cores. Currently, there is only a local work
queue for a graph to walk. We introduce a scheduler work queue on
each worker core for dispatching tasks. The graph walk processes the
scheduler work queue first, then handles the local work queue.

Signed-off-by: Haiyue Wang <haiyue.wang@intel.com>
Signed-off-by: Cunming Liang <cunming.liang@intel.com>
Signed-off-by: Zhirun Yan <zhirun.yan@intel.com>
---
 lib/graph/graph.c            |  6 ++++++
 lib/graph/rte_graph_worker.h | 11 +++++++++++
 2 files changed, 17 insertions(+)
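
Note: graph_sched_wq_create()/graph_sched_wq_destroy() and
__rte_graph_sched_wq_process() are introduced earlier in this series and
are not shown in this patch. Below is a minimal sketch of what the
per-core work-queue drain is expected to do, assuming a ring-backed
queue of dispatched streams; the task layout and function name are
illustrative, not the series' actual definitions.

	#include <rte_graph_worker.h>
	#include <rte_ring.h>

	/* Hypothetical shape of a dispatched stream. */
	struct graph_sched_task {
		struct rte_node *node; /* destination node bound to this core */
		void **objs;           /* objects handed over by the source core */
		uint16_t nb_objs;
	};

	/* Drain the streams that other cores enqueued to this core's work
	 * queue and run the destination node's process() callback on them. */
	static inline void
	sched_wq_process_sketch(struct rte_graph *graph, struct rte_ring *wq)
	{
		struct graph_sched_task *task;

		while (rte_ring_sc_dequeue(wq, (void **)&task) == 0) {
			task->node->process(graph, task->node, task->objs,
					    task->nb_objs);
			/* task memory would be recycled to its pool here */
		}
	}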
  

Comments

Pavan Nikhilesh Bhagavatula Sept. 8, 2022, 5:27 a.m. UTC | #1
> This patch enhances the task scheduling mechanism to enable dispatching
> tasks to other worker cores. Currently, there is only a local work
> queue for a graph to walk. We introduce a scheduler work queue on
> each worker core for dispatching tasks. The graph walk processes the
> scheduler work queue first, then handles the local work queue.
> 
> Signed-off-by: Haiyue Wang <haiyue.wang@intel.com>
> Signed-off-by: Cunming Liang <cunming.liang@intel.com>
> Signed-off-by: Zhirun Yan <zhirun.yan@intel.com>
> ---
>  lib/graph/graph.c            |  6 ++++++
>  lib/graph/rte_graph_worker.h | 11 +++++++++++
>  2 files changed, 17 insertions(+)
> 
> diff --git a/lib/graph/graph.c b/lib/graph/graph.c
> index b4eb18175a..49ea2b3fbb 100644
> --- a/lib/graph/graph.c
> +++ b/lib/graph/graph.c
> @@ -368,6 +368,8 @@ rte_graph_destroy(rte_graph_t id)
>  	while (graph != NULL) {
>  		tmp = STAILQ_NEXT(graph, next);
>  		if (graph->id == id) {
> +			/* Destroy the graph's scheduling work queue, if any */
> +			graph_sched_wq_destroy(graph);
>  			/* Call fini() of the all the nodes in the graph */
>  			graph_node_fini(graph);
>  			/* Destroy graph fast path memory */
> @@ -470,6 +472,10 @@ graph_clone(struct graph *parent_graph, const char *name,
>  	if (graph_node_init(graph))
>  		goto graph_mem_destroy;
> 
> +	/* Create the graph schedule work queue */
> +	if (graph_sched_wq_create(graph, parent_graph))
> +		goto graph_mem_destroy;
> +
>  	/* All good, Lets add the graph to the list */
>  	graph_id++;
>  	STAILQ_INSERT_TAIL(&graph_list, graph, next);
> diff --git a/lib/graph/rte_graph_worker.h b/lib/graph/rte_graph_worker.h
> index faf3f31ddc..e98697d880 100644
> --- a/lib/graph/rte_graph_worker.h
> +++ b/lib/graph/rte_graph_worker.h
> @@ -177,6 +177,7 @@ static inline void
>  rte_graph_walk(struct rte_graph *graph)
>  {
>  	const rte_graph_off_t *cir_start = graph->cir_start;
> +	const unsigned int lcore_id = graph->lcore_id;
>  	const rte_node_t mask = graph->cir_mask;
>  	uint32_t head = graph->head;
>  	struct rte_node *node;
> @@ -184,6 +185,9 @@ rte_graph_walk(struct rte_graph *graph)
>  	uint16_t rc;
>  	void **objs;
> 
> +	if (graph->wq != NULL)
> +		__rte_graph_sched_wq_process(graph);
> +


We should introduce a flags field in rte_graph_param which the application
can use to define whether a graph should support multi-core dispatch.

Then we can make `__rte_graph_sched_wq_process` node 0 during graph
creation so that it is always called at the start of graph processing,
followed by the rest of the nodes.
This will remove unnecessary branches in the fast path.
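
A sketch of this suggestion, with illustrative names only (the `flags`
field and `RTE_GRAPH_F_MCORE_DISPATCH` do not exist in rte_graph_param
today; the proposed member is modeled here as a local wrapper type so
the idea stays concrete and compilable):

	#include <stdbool.h>
	#include <rte_bitops.h>
	#include <rte_graph.h>

	/* Hypothetical flag; the real name would be settled later. */
	#define RTE_GRAPH_F_MCORE_DISPATCH RTE_BIT64(0)

	/* Proposed extension: the real change would add 'flags' directly
	 * to struct rte_graph_param. */
	struct graph_param_proposed {
		struct rte_graph_param base; /* existing parameters */
		uint64_t flags;              /* new: RTE_GRAPH_F_* */
	};

	/* Creation-time decision this enables: if the flag is set, install
	 * __rte_graph_sched_wq_process as node 0 so every walk begins by
	 * draining the scheduler work queue, with no branch in the hot loop. */
	static inline bool
	graph_wants_mcore_dispatch(const struct graph_param_proposed *prm)
	{
		return (prm->flags & RTE_GRAPH_F_MCORE_DISPATCH) != 0;
	}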

>  	/*
> 	 * Walk on the source node(s) ((cir_start - head) -> cir_start) and then
>  	 * on the pending streams (cir_start -> (cir_start + mask) -> cir_start)
> @@ -205,6 +209,12 @@ rte_graph_walk(struct rte_graph *graph)
>  		objs = node->objs;
>  		rte_prefetch0(objs);
> 
> +		/* Schedule the node until all tasks/objs are done */
> +		if (node->lcore_id != RTE_MAX_LCORE && (int32_t)head > 0 &&
> +		    lcore_id != node->lcore_id && graph->rq != NULL &&
> +		    __rte_graph_sched_node_enqueue(node, graph->rq))
> +			goto next;
> +
>  		if (rte_graph_has_stats_feature()) {
>  			start = rte_rdtsc();
>  			rc = node->process(graph, node, objs, node->idx);
> @@ -215,6 +225,7 @@ rte_graph_walk(struct rte_graph *graph)
>  			node->process(graph, node, objs, node->idx);
>  		}
>  		node->idx = 0;
> +	next:
>  		head = likely((int32_t)head > 0) ? head & mask : head;
>  	}
>  	graph->tail = 0;
> --
> 2.25.1
  
Yan, Zhirun Sept. 15, 2022, 1:52 a.m. UTC | #2
> -----Original Message-----
> From: Pavan Nikhilesh Bhagavatula <pbhagavatula@marvell.com>
> Sent: Thursday, September 8, 2022 1:27 PM
> To: Yan, Zhirun <zhirun.yan@intel.com>; dev@dpdk.org; Jerin Jacob
> Kollanukkaran <jerinj@marvell.com>; Kiran Kumar Kokkilagadda
> <kirankumark@marvell.com>
> Cc: Liang, Cunming <cunming.liang@intel.com>; Wang, Haiyue
> <haiyue.wang@intel.com>
> Subject: RE: [EXT] [RFC, v1 4/6] graph: enhance graph walk by cross-core dispatch
> 
> > This patch enhances the task scheduling mechanism to enable dispatching
> > tasks to other worker cores. Currently, there is only a local work
> > queue for a graph to walk. We introduce a scheduler work queue on
> > each worker core for dispatching tasks. The graph walk processes the
> > scheduler work queue first, then handles the local work queue.
> >
> > Signed-off-by: Haiyue Wang <haiyue.wang@intel.com>
> > Signed-off-by: Cunming Liang <cunming.liang@intel.com>
> > Signed-off-by: Zhirun Yan <zhirun.yan@intel.com>
> > ---
> >  lib/graph/graph.c            |  6 ++++++
> >  lib/graph/rte_graph_worker.h | 11 +++++++++++
> >  2 files changed, 17 insertions(+)
> >
> > diff --git a/lib/graph/graph.c b/lib/graph/graph.c
> > index b4eb18175a..49ea2b3fbb 100644
> > --- a/lib/graph/graph.c
> > +++ b/lib/graph/graph.c
> > @@ -368,6 +368,8 @@ rte_graph_destroy(rte_graph_t id)
> >  	while (graph != NULL) {
> >  		tmp = STAILQ_NEXT(graph, next);
> >  		if (graph->id == id) {
> > +			/* Destroy the graph's scheduling work queue, if any */
> > +			graph_sched_wq_destroy(graph);
> >  			/* Call fini() of the all the nodes in the graph */
> >  			graph_node_fini(graph);
> >  			/* Destroy graph fast path memory */
> > @@ -470,6 +472,10 @@ graph_clone(struct graph *parent_graph, const char *name,
> >  	if (graph_node_init(graph))
> >  		goto graph_mem_destroy;
> >
> > +	/* Create the graph schedule work queue */
> > +	if (graph_sched_wq_create(graph, parent_graph))
> > +		goto graph_mem_destroy;
> > +
> >  	/* All good, Lets add the graph to the list */
> >  	graph_id++;
> >  	STAILQ_INSERT_TAIL(&graph_list, graph, next);
> > diff --git a/lib/graph/rte_graph_worker.h b/lib/graph/rte_graph_worker.h
> > index faf3f31ddc..e98697d880 100644
> > --- a/lib/graph/rte_graph_worker.h
> > +++ b/lib/graph/rte_graph_worker.h
> > @@ -177,6 +177,7 @@ static inline void
> >  rte_graph_walk(struct rte_graph *graph)
> >  {
> >  	const rte_graph_off_t *cir_start = graph->cir_start;
> > +	const unsigned int lcore_id = graph->lcore_id;
> >  	const rte_node_t mask = graph->cir_mask;
> >  	uint32_t head = graph->head;
> >  	struct rte_node *node;
> > @@ -184,6 +185,9 @@ rte_graph_walk(struct rte_graph *graph)
> >  	uint16_t rc;
> >  	void **objs;
> >
> > +	if (graph->wq != NULL)
> > +		__rte_graph_sched_wq_process(graph);
> > +
> 
> 
> We should introduce a flags field in rte_graph_param which the application
> can use to define whether a graph should support multi-core dispatch.
> 
Yes, I will add a flags field in the next version.

> Then we can make `__rte_graph_sched_wq_process` node 0 during graph
> creation so that it is always called at the start of graph processing,
> followed by the rest of the nodes.
> This will remove unnecessary branches in the fast path.
> 

Thanks for your comments, and sorry for my late reply.
Yes, we could make `__rte_graph_sched_wq_process` node 0 with the dispatch flag set.
But I am not sure whether we need to register a new node here: it means we would
change the graph topology, and it would be an isolated node with no links.


> >  	/*
> >  	 * Walk on the source node(s) ((cir_start - head) -> cir_start) and then
> >  	 * on the pending streams (cir_start -> (cir_start + mask) -> cir_start)
> > @@ -205,6 +209,12 @@ rte_graph_walk(struct rte_graph *graph)
> >  		objs = node->objs;
> >  		rte_prefetch0(objs);
> >
> > +		/* Schedule the node until all tasks/objs are done */
> > +		if (node->lcore_id != RTE_MAX_LCORE && (int32_t)head > 0 &&
> > +		    lcore_id != node->lcore_id && graph->rq != NULL &&
> > +		    __rte_graph_sched_node_enqueue(node, graph->rq))
> > +			goto next;
> > +
> >  		if (rte_graph_has_stats_feature()) {
> >  			start = rte_rdtsc();
> >  			rc = node->process(graph, node, objs, node->idx);
> > @@ -215,6 +225,7 @@ rte_graph_walk(struct rte_graph *graph)
> >  			node->process(graph, node, objs, node->idx);
> >  		}
> >  		node->idx = 0;
> > +	next:
> >  		head = likely((int32_t)head > 0) ? head & mask : head;
> >  	}
> >  	graph->tail = 0;
> > --
> > 2.25.1
  

Patch

diff --git a/lib/graph/graph.c b/lib/graph/graph.c
index b4eb18175a..49ea2b3fbb 100644
--- a/lib/graph/graph.c
+++ b/lib/graph/graph.c
@@ -368,6 +368,8 @@  rte_graph_destroy(rte_graph_t id)
 	while (graph != NULL) {
 		tmp = STAILQ_NEXT(graph, next);
 		if (graph->id == id) {
+			/* Destroy the graph's scheduling work queue, if any */
+			graph_sched_wq_destroy(graph);
 			/* Call fini() of the all the nodes in the graph */
 			graph_node_fini(graph);
 			/* Destroy graph fast path memory */
@@ -470,6 +472,10 @@  graph_clone(struct graph *parent_graph, const char *name,
 	if (graph_node_init(graph))
 		goto graph_mem_destroy;
 
+	/* Create the graph schedule work queue */
+	if (graph_sched_wq_create(graph, parent_graph))
+		goto graph_mem_destroy;
+
 	/* All good, Lets add the graph to the list */
 	graph_id++;
 	STAILQ_INSERT_TAIL(&graph_list, graph, next);
diff --git a/lib/graph/rte_graph_worker.h b/lib/graph/rte_graph_worker.h
index faf3f31ddc..e98697d880 100644
--- a/lib/graph/rte_graph_worker.h
+++ b/lib/graph/rte_graph_worker.h
@@ -177,6 +177,7 @@  static inline void
 rte_graph_walk(struct rte_graph *graph)
 {
 	const rte_graph_off_t *cir_start = graph->cir_start;
+	const unsigned int lcore_id = graph->lcore_id;
 	const rte_node_t mask = graph->cir_mask;
 	uint32_t head = graph->head;
 	struct rte_node *node;
@@ -184,6 +185,9 @@  rte_graph_walk(struct rte_graph *graph)
 	uint16_t rc;
 	void **objs;
 
+	if (graph->wq != NULL)
+		__rte_graph_sched_wq_process(graph);
+
 	/*
 	 * Walk on the source node(s) ((cir_start - head) -> cir_start) and then
 	 * on the pending streams (cir_start -> (cir_start + mask) -> cir_start)
@@ -205,6 +209,12 @@  rte_graph_walk(struct rte_graph *graph)
 		objs = node->objs;
 		rte_prefetch0(objs);
 
+		/* Schedule the node until all tasks/objs are done */
+		if (node->lcore_id != RTE_MAX_LCORE && (int32_t)head > 0 &&
+		    lcore_id != node->lcore_id && graph->rq != NULL &&
+		    __rte_graph_sched_node_enqueue(node, graph->rq))
+			goto next;
+
 		if (rte_graph_has_stats_feature()) {
 			start = rte_rdtsc();
 			rc = node->process(graph, node, objs, node->idx);
@@ -215,6 +225,7 @@  rte_graph_walk(struct rte_graph *graph)
 			node->process(graph, node, objs, node->idx);
 		}
 		node->idx = 0;
+	next:
 		head = likely((int32_t)head > 0) ? head & mask : head;
 	}
 	graph->tail = 0;
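
For reference, a minimal sketch of the intended per-worker usage. This
is the standard rte_graph worker loop; the per-core graph clone and the
node-to-lcore binding come from other patches in this series.

	#include <rte_graph_worker.h>
	#include <rte_lcore.h>

	/* With this patch, each walk first drains streams that other cores
	 * dispatched to this core's work queue (graph->wq), then walks the
	 * local circular buffer. Nodes bound to a different lcore_id are
	 * handed off via graph->rq instead of being processed locally. */
	static int
	graph_worker_main(void *arg)
	{
		struct rte_graph *graph = arg; /* this worker's clone */

		for (;;)
			rte_graph_walk(graph);

		return 0;
	}

	/* From the main lcore:
	 * rte_eal_remote_launch(graph_worker_main, graph, worker_lcore_id);
	 */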