[2/3] net/tap: Fixed RSS algorithm to support fragmented packets

Message ID 20231031220921.96023-3-stephen@networkplumber.org (mailing list archive)
State Superseded, archived
Delegated to: Ferruh Yigit
Headers
Series net/tap: update and fix the BPF program |

Checks

Context Check Description
ci/checkpatch success coding style OK

Commit Message

Stephen Hemminger Oct. 31, 2023, 10:08 p.m. UTC
  From: Madhuker Mythri <madhuker.mythri@oracle.com>

Analysis of the TAP PMD shows that the existing RSS algorithm always
hashes the 4-tuple (Src-IP, Dst-IP, Src-port and Dst-port) and never
checks whether a packet is fragmented. As a result, the fragments of a
single packet get different RSS hash values and are distributed across
multiple queues.
The RSS algorithm assumes that every incoming IP packet carries an
L4 (UDP/TCP) header and tries to fetch the L4 fields (Src-port and
Dst-port) from each incoming packet. For fragmented packets, however,
the L4 header is present only in the first fragment, so the L4 header
fields should not be included in the RSS hash for fragmented packets.
This is a bug in the RSS algorithm implemented in the BPF program of
the TAP PMD.

So, modify the RSS eBPF C program and regenerate the C array in the
'tap_bpf_insns.h' file, which holds the program in eBPF byte-code
instruction format.

Bugzilla Id: 870

Signed-off-by: Madhuker Mythri <madhuker.mythri@oracle.com>
Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
 drivers/net/tap/bpf/tap_bpf_program.c | 47 ++++++++++++++++++++++-----
 1 file changed, 39 insertions(+), 8 deletions(-)
  

Patch

diff --git a/drivers/net/tap/bpf/tap_bpf_program.c b/drivers/net/tap/bpf/tap_bpf_program.c
index ff6f1606fb..3d431dfa43 100644
--- a/drivers/net/tap/bpf/tap_bpf_program.c
+++ b/drivers/net/tap/bpf/tap_bpf_program.c
@@ -19,6 +19,8 @@ 
 #include "bpf_elf.h"
 #include "../tap_rss.h"
 
+#include "bpf_api.h"
+
 /** Create IPv4 address */
 #define IPv4(a, b, c, d) ((__u32)(((a) & 0xff) << 24) | \
 		(((b) & 0xff) << 16) | \
@@ -132,6 +134,8 @@  rss_l3_l4(struct __sk_buff *skb)
 	__u8 *key = 0;
 	__u32 len;
 	__u32 queue = 0;
+	bool mf = 0;
+	__u16 frag_off = 0;
 
 	rsskey = map_lookup_elem(&map_keys, &key_idx);
 	if (!rsskey) {
@@ -156,6 +160,8 @@  rss_l3_l4(struct __sk_buff *skb)
 			return TC_ACT_OK;
 
 		__u8 *src_dst_addr = data + off + offsetof(struct iphdr, saddr);
+		__u8 *frag_off_addr = data + off + offsetof(struct iphdr, frag_off);
+		__u8 *prot_addr = data + off + offsetof(struct iphdr, protocol);
 		__u8 *src_dst_port = data + off + sizeof(struct iphdr);
 		struct ipv4_l3_l4_tuple v4_tuple = {
 			.src_addr = IPv4(*(src_dst_addr + 0),
@@ -166,11 +172,25 @@  rss_l3_l4(struct __sk_buff *skb)
 					*(src_dst_addr + 5),
 					*(src_dst_addr + 6),
 					*(src_dst_addr + 7)),
-			.sport = PORT(*(src_dst_port + 0),
-					*(src_dst_port + 1)),
-			.dport = PORT(*(src_dst_port + 2),
-					*(src_dst_port + 3)),
+			.sport = 0,
+			.dport = 0,
 		};
+		/** Fetch the L4-layer port numbers only in case of TCP/UDP
+		 ** and only if the packet is not fragmented, since fragmented
+		 ** chunks do not have an L4 TCP/UDP header.
+		 **/
+		if (*prot_addr == IPPROTO_UDP || *prot_addr == IPPROTO_TCP) {
+			frag_off = PORT(*(frag_off_addr + 0),
+					*(frag_off_addr + 1));
+			mf = frag_off & 0x2000;
+			frag_off = frag_off & 0x1fff;
+			if (mf == 0 && frag_off == 0) {
+				v4_tuple.sport = PORT(*(src_dst_port + 0),
+						*(src_dst_port + 1));
+				v4_tuple.dport = PORT(*(src_dst_port + 2),
+						*(src_dst_port + 3));
+			}
+		}
 		__u8 input_len = sizeof(v4_tuple) / sizeof(__u32);
 		if (rsskey->hash_fields & (1 << HASH_FIELD_IPV4_L3))
 			input_len--;
@@ -183,6 +203,9 @@  rss_l3_l4(struct __sk_buff *skb)
 					offsetof(struct ipv6hdr, saddr);
 		__u8 *src_dst_port = data + off +
 					sizeof(struct ipv6hdr);
+		__u8 *next_hdr = data + off +
+					offsetof(struct ipv6hdr, nexthdr);
+
 		struct ipv6_l3_l4_tuple v6_tuple;
 		for (j = 0; j < 4; j++)
 			*((uint32_t *)&v6_tuple.src_addr + j) =
@@ -192,10 +215,18 @@  rss_l3_l4(struct __sk_buff *skb)
 			*((uint32_t *)&v6_tuple.dst_addr + j) =
 				__builtin_bswap32(*((uint32_t *)
 						src_dst_addr + 4 + j));
-		v6_tuple.sport = PORT(*(src_dst_port + 0),
-			      *(src_dst_port + 1));
-		v6_tuple.dport = PORT(*(src_dst_port + 2),
-			      *(src_dst_port + 3));
+
+		/** Fetch the L4 header port-numbers only if next-header
+		 * is TCP/UDP **/
+		if (*next_hdr == IPPROTO_UDP || *next_hdr == IPPROTO_TCP) {
+			v6_tuple.sport = PORT(*(src_dst_port + 0),
+				      *(src_dst_port + 1));
+			v6_tuple.dport = PORT(*(src_dst_port + 2),
+				      *(src_dst_port + 3));
+		} else {
+			v6_tuple.sport = 0;
+			v6_tuple.dport = 0;
+		}
 
 		__u8 input_len = sizeof(v6_tuple) / sizeof(__u32);
 		if (rsskey->hash_fields & (1 << HASH_FIELD_IPV6_L3))