ip_frag: add IPv4 fast fragment switch and test data

Message ID 1654161183-5391-1-git-send-email-chcchc88@163.com (mailing list archive)
State Superseded, archived
Delegated to: Thomas Monjalon
Headers
Series ip_frag: add IPv4 fast fragment switch and test data |

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/Intel-compilation success Compilation OK
ci/iol-intel-Functional success Functional Testing PASS
ci/iol-mellanox-Performance success Performance Testing PASS
ci/intel-Testing success Testing PASS
ci/iol-intel-Performance success Performance Testing PASS
ci/iol-aarch64-unit-testing success Testing PASS
ci/github-robot: build success github build: passed
ci/iol-x86_64-compile-testing success Testing PASS
ci/iol-x86_64-unit-testing success Testing PASS
ci/iol-aarch64-compile-testing success Testing PASS
ci/iol-abi-testing success Testing PASS

Commit Message

Huichao Cai June 2, 2022, 9:13 a.m. UTC
  Some NIC drivers support DEV_TX_OFFLOAD_MBUF_FAST_FREE offload(
Device supports optimization for fast release of mbufs.When set
application must guarantee that per-queue all mbufs comes from the
same mempool and has refcnt = 1).In order to adapt to this offload
function,we need to modify the existing fragment logic(attach mbuf,
so it is fast,we can call it fast fragment mode) and add the fragment
logic in the non-attach mbuf mode(slow fragment mode).Add some test
data for this modification.

Signed-off-by: Huichao Cai <chcchc88@163.com>
---
 app/test/test_ipfrag.c               | 14 +++++++--
 lib/ip_frag/rte_ipv4_fragmentation.c | 56 +++++++++++++++++++++++++-----------
 2 files changed, 51 insertions(+), 19 deletions(-)
  

Comments

Konstantin Ananyev June 3, 2022, 11:50 p.m. UTC | #1
> Some NIC drivers support DEV_TX_OFFLOAD_MBUF_FAST_FREE offload(
> Device supports optimization for fast release of mbufs.When set
> application must guarantee that per-queue all mbufs comes from the
> same mempool and has refcnt = 1).In order to adapt to this offload
> function,we need to modify the existing fragment logic(attach mbuf,
> so it is fast,we can call it fast fragment mode) and add the fragment
> logic in the non-attach mbuf mode(slow fragment mode).Add some test
> data for this modification.


That doesn't look like  a good idea to me.
Yes, drivers configured with MBUF_FAST_FREE would not work
correctly with indirect mbufs.
So it is application responsibility to choose what it wants:
either indirect mbufs enabled or MBUF_FAST_FREE enabled.
Inside the lib we shouldn't try to guess what was particular
driver configuration.
Plus it is the change (and regression) of existing behavior -
right now it is perfectly fine to use the same mbuf for both
direct and indirect mbufs.
If you really have a use-case for doing fragmentation via
full copying all packet data, then probably the easiest and
safest way would be to introduce new function:
rte_ipv4_fragment_clone_packet(...) or so.


> 
> Signed-off-by: Huichao Cai <chcchc88@163.com>
> ---
>   app/test/test_ipfrag.c               | 14 +++++++--
>   lib/ip_frag/rte_ipv4_fragmentation.c | 56 +++++++++++++++++++++++++-----------
>   2 files changed, 51 insertions(+), 19 deletions(-)
> 
> diff --git a/app/test/test_ipfrag.c b/app/test/test_ipfrag.c
> index 610a86b..f5fe4b8 100644
> --- a/app/test/test_ipfrag.c
> +++ b/app/test/test_ipfrag.c
> @@ -407,12 +407,20 @@ static void ut_teardown(void)
>   					      pktid);
>   		}
>   
> -		if (tests[i].ipv == 4)
> -			len = rte_ipv4_fragment_packet(b, pkts_out, BURST,
> +		if (tests[i].ipv == 4) {
> +			if (i % 2)
> +				len = rte_ipv4_fragment_packet(b,
> +						       pkts_out, BURST,
>   						       tests[i].mtu_size,
>   						       direct_pool,
>   						       indirect_pool);
> -		else if (tests[i].ipv == 6)
> +			else
> +				len = rte_ipv4_fragment_packet(b,
> +						       pkts_out, BURST,
> +							   tests[i].mtu_size,
> +							   direct_pool,
> +							   direct_pool);
> +		} else if (tests[i].ipv == 6)
>   			len = rte_ipv6_fragment_packet(b, pkts_out, BURST,
>   						       tests[i].mtu_size,
>   						       direct_pool,
> diff --git a/lib/ip_frag/rte_ipv4_fragmentation.c b/lib/ip_frag/rte_ipv4_fragmentation.c
> index a562424..65bfad7 100644
> --- a/lib/ip_frag/rte_ipv4_fragmentation.c
> +++ b/lib/ip_frag/rte_ipv4_fragmentation.c
> @@ -102,6 +102,11 @@ static inline uint16_t __create_ipopt_frag_hdr(uint8_t *iph,
>    *   MBUF pool used for allocating direct buffers for the output fragments.
>    * @param pool_indirect
>    *   MBUF pool used for allocating indirect buffers for the output fragments.
> + *   If pool_indirect == pool_direct,this means that the fragment will adapt
> + *   to DEV_TX_OFFLOAD_MBUF_FAST_FREE offload.
> + *   DEV_TX_OFFLOAD_MBUF_FAST_FREE: Device supports optimization
> + *   for fast release of mbufs. When set application must guarantee that
> + *   per-queue all mbufs comes from the same mempool and has refcnt = 1.
>    * @return
>    *   Upon successful completion - number of output fragments placed
>    *   in the pkts_out array.
> @@ -123,6 +128,7 @@ static inline uint16_t __create_ipopt_frag_hdr(uint8_t *iph,
>   	uint16_t frag_bytes_remaining;
>   	uint8_t ipopt_frag_hdr[IPV4_HDR_MAX_LEN];
>   	uint16_t ipopt_len;
> +	bool is_fast_frag_mode = true;
>   
>   	/*
>   	 * Formal parameter checking.
> @@ -133,6 +139,9 @@ static inline uint16_t __create_ipopt_frag_hdr(uint8_t *iph,
>   	    unlikely(mtu_size < RTE_ETHER_MIN_MTU))
>   		return -EINVAL;
>   
> +	if (pool_indirect == pool_direct)
> +		is_fast_frag_mode = false;
> +
>   	in_hdr = rte_pktmbuf_mtod(pkt_in, struct rte_ipv4_hdr *);
>   	header_len = (in_hdr->version_ihl & RTE_IPV4_HDR_IHL_MASK) *
>   	    RTE_IPV4_IHL_MULTIPLIER;
> @@ -190,30 +199,45 @@ static inline uint16_t __create_ipopt_frag_hdr(uint8_t *iph,
>   		out_seg_prev = out_pkt;
>   		more_out_segs = 1;
>   		while (likely(more_out_segs && more_in_segs)) {
> -			struct rte_mbuf *out_seg = NULL;
>   			uint32_t len;
>   
> -			/* Allocate indirect buffer */
> -			out_seg = rte_pktmbuf_alloc(pool_indirect);
> -			if (unlikely(out_seg == NULL)) {
> -				rte_pktmbuf_free(out_pkt);
> -				__free_fragments(pkts_out, out_pkt_pos);
> -				return -ENOMEM;
> -			}
> -			out_seg_prev->next = out_seg;
> -			out_seg_prev = out_seg;
> -
> -			/* Prepare indirect buffer */
> -			rte_pktmbuf_attach(out_seg, in_seg);
>   			len = frag_bytes_remaining;
>   			if (len > (in_seg->data_len - in_seg_data_pos)) {
>   				len = in_seg->data_len - in_seg_data_pos;
>   			}
> -			out_seg->data_off = in_seg->data_off + in_seg_data_pos;
> -			out_seg->data_len = (uint16_t)len;
> +
> +			if (is_fast_frag_mode) {
> +				struct rte_mbuf *out_seg = NULL;
> +				/* Allocate indirect buffer */
> +				out_seg = rte_pktmbuf_alloc(pool_indirect);
> +				if (unlikely(out_seg == NULL)) {
> +					rte_pktmbuf_free(out_pkt);
> +					__free_fragments(pkts_out, out_pkt_pos);
> +					return -ENOMEM;
> +				}
> +				out_seg_prev->next = out_seg;
> +				out_seg_prev = out_seg;
> +
> +				/* Prepare indirect buffer */
> +				rte_pktmbuf_attach(out_seg, in_seg);
> +
> +				out_seg->data_off = in_seg->data_off +
> +					in_seg_data_pos;
> +				out_seg->data_len = (uint16_t)len;
> +				out_pkt->nb_segs += 1;
> +			} else {
> +				rte_memcpy(
> +				    rte_pktmbuf_mtod_offset(out_pkt, char *,
> +					    out_pkt->pkt_len),
> +				    rte_pktmbuf_mtod_offset(in_seg, char *,
> +					    in_seg_data_pos),
> +				    len);
> +				out_pkt->data_len = (uint16_t)(len +
> +				    out_pkt->data_len);
> +			}
> +
>   			out_pkt->pkt_len = (uint16_t)(len +
>   			    out_pkt->pkt_len);
> -			out_pkt->nb_segs += 1;
>   			in_seg_data_pos += len;
>   			frag_bytes_remaining -= len;
>
  
Huichao Cai June 4, 2022, 2:13 a.m. UTC | #2
Thank you for your reply, Konstantin.
Both MBUF_FAST_FREE and indirect mbufs are for performance reasons, 
but they cannot be used at the same time, which is indeed a pity. In a real-world
application, I don't know which method is more conducive to the optimization of
overall performance, so I want to add a choice for the application, do you think it
is necessary(like rte_ipv4_fragment_clone_packet)?


Huichao,Cai
  
Huichao Cai June 4, 2022, 2:19 a.m. UTC | #3
I've seen some applications that have to rewrite fragment
functions themselves in order to use MBUF_FAST_FREE
features, such as iQiYi's DPVS.
  
Konstantin Ananyev June 4, 2022, 11:36 a.m. UTC | #4
04/06/2022 03:19, Huichao Cai пишет:
> I've seen some applications that have to rewrite fragment
> functions themselves in order to use MBUF_FAST_FREE
> features, such as iQiYi's DPVS.
> 
> 


I am not sure that it will really help to improve performance,
as if you have a lot of packets to fragment, you'll probably
spend more time copying them.
Might be it will help somehow if you'll have
very rare occurrence of such packets.
Also please keep in mind, that ip_frag is not the only
one that does use indirect mbufs and refcnt.
As another example - GSO implementation.
So application writer has to be extremely careful
when enabling MBUF_FAST_FREE.
My personal advice - just don't use it,
though I am quite conservative here.

Anyway, as I said before, if there is a real use-case for it -
I am ok to introduce new function that would do copying
while fragmenting.

Konstantin
  

Patch

diff --git a/app/test/test_ipfrag.c b/app/test/test_ipfrag.c
index 610a86b..f5fe4b8 100644
--- a/app/test/test_ipfrag.c
+++ b/app/test/test_ipfrag.c
@@ -407,12 +407,20 @@  static void ut_teardown(void)
 					      pktid);
 		}
 
-		if (tests[i].ipv == 4)
-			len = rte_ipv4_fragment_packet(b, pkts_out, BURST,
+		if (tests[i].ipv == 4) {
+			if (i % 2)
+				len = rte_ipv4_fragment_packet(b,
+						       pkts_out, BURST,
 						       tests[i].mtu_size,
 						       direct_pool,
 						       indirect_pool);
-		else if (tests[i].ipv == 6)
+			else
+				len = rte_ipv4_fragment_packet(b,
+						       pkts_out, BURST,
+							   tests[i].mtu_size,
+							   direct_pool,
+							   direct_pool);
+		} else if (tests[i].ipv == 6)
 			len = rte_ipv6_fragment_packet(b, pkts_out, BURST,
 						       tests[i].mtu_size,
 						       direct_pool,
diff --git a/lib/ip_frag/rte_ipv4_fragmentation.c b/lib/ip_frag/rte_ipv4_fragmentation.c
index a562424..65bfad7 100644
--- a/lib/ip_frag/rte_ipv4_fragmentation.c
+++ b/lib/ip_frag/rte_ipv4_fragmentation.c
@@ -102,6 +102,11 @@  static inline uint16_t __create_ipopt_frag_hdr(uint8_t *iph,
  *   MBUF pool used for allocating direct buffers for the output fragments.
  * @param pool_indirect
  *   MBUF pool used for allocating indirect buffers for the output fragments.
+ *   If pool_indirect == pool_direct,this means that the fragment will adapt
+ *   to DEV_TX_OFFLOAD_MBUF_FAST_FREE offload.
+ *   DEV_TX_OFFLOAD_MBUF_FAST_FREE: Device supports optimization
+ *   for fast release of mbufs. When set application must guarantee that
+ *   per-queue all mbufs comes from the same mempool and has refcnt = 1.
  * @return
  *   Upon successful completion - number of output fragments placed
  *   in the pkts_out array.
@@ -123,6 +128,7 @@  static inline uint16_t __create_ipopt_frag_hdr(uint8_t *iph,
 	uint16_t frag_bytes_remaining;
 	uint8_t ipopt_frag_hdr[IPV4_HDR_MAX_LEN];
 	uint16_t ipopt_len;
+	bool is_fast_frag_mode = true;
 
 	/*
 	 * Formal parameter checking.
@@ -133,6 +139,9 @@  static inline uint16_t __create_ipopt_frag_hdr(uint8_t *iph,
 	    unlikely(mtu_size < RTE_ETHER_MIN_MTU))
 		return -EINVAL;
 
+	if (pool_indirect == pool_direct)
+		is_fast_frag_mode = false;
+
 	in_hdr = rte_pktmbuf_mtod(pkt_in, struct rte_ipv4_hdr *);
 	header_len = (in_hdr->version_ihl & RTE_IPV4_HDR_IHL_MASK) *
 	    RTE_IPV4_IHL_MULTIPLIER;
@@ -190,30 +199,45 @@  static inline uint16_t __create_ipopt_frag_hdr(uint8_t *iph,
 		out_seg_prev = out_pkt;
 		more_out_segs = 1;
 		while (likely(more_out_segs && more_in_segs)) {
-			struct rte_mbuf *out_seg = NULL;
 			uint32_t len;
 
-			/* Allocate indirect buffer */
-			out_seg = rte_pktmbuf_alloc(pool_indirect);
-			if (unlikely(out_seg == NULL)) {
-				rte_pktmbuf_free(out_pkt);
-				__free_fragments(pkts_out, out_pkt_pos);
-				return -ENOMEM;
-			}
-			out_seg_prev->next = out_seg;
-			out_seg_prev = out_seg;
-
-			/* Prepare indirect buffer */
-			rte_pktmbuf_attach(out_seg, in_seg);
 			len = frag_bytes_remaining;
 			if (len > (in_seg->data_len - in_seg_data_pos)) {
 				len = in_seg->data_len - in_seg_data_pos;
 			}
-			out_seg->data_off = in_seg->data_off + in_seg_data_pos;
-			out_seg->data_len = (uint16_t)len;
+
+			if (is_fast_frag_mode) {
+				struct rte_mbuf *out_seg = NULL;
+				/* Allocate indirect buffer */
+				out_seg = rte_pktmbuf_alloc(pool_indirect);
+				if (unlikely(out_seg == NULL)) {
+					rte_pktmbuf_free(out_pkt);
+					__free_fragments(pkts_out, out_pkt_pos);
+					return -ENOMEM;
+				}
+				out_seg_prev->next = out_seg;
+				out_seg_prev = out_seg;
+
+				/* Prepare indirect buffer */
+				rte_pktmbuf_attach(out_seg, in_seg);
+
+				out_seg->data_off = in_seg->data_off +
+					in_seg_data_pos;
+				out_seg->data_len = (uint16_t)len;
+				out_pkt->nb_segs += 1;
+			} else {
+				rte_memcpy(
+				    rte_pktmbuf_mtod_offset(out_pkt, char *,
+					    out_pkt->pkt_len),
+				    rte_pktmbuf_mtod_offset(in_seg, char *,
+					    in_seg_data_pos),
+				    len);
+				out_pkt->data_len = (uint16_t)(len +
+				    out_pkt->data_len);
+			}
+
 			out_pkt->pkt_len = (uint16_t)(len +
 			    out_pkt->pkt_len);
-			out_pkt->nb_segs += 1;
 			in_seg_data_pos += len;
 			frag_bytes_remaining -= len;